FOSSology  4.4.0
Open Source License Compliance by Open Source Software
cleanEntries.cc
Go to the documentation of this file.
1 /*
2  SPDX-FileCopyrightText: © 2014-2015,2022 Siemens AG
3  Author: Johannes Najjar
4 
5  SPDX-License-Identifier: GPL-2.0-only
6 */
7 
15 #include "cleanEntries.hpp"
16 #include <sstream>
17 #include <iterator>
18 using std::stringstream;
19 using std::ostream_iterator;
20 
29 string cleanGeneral(string::const_iterator sBegin, string::const_iterator sEnd)
30 {
31  stringstream ss;
32  rx::regex_replace(ostream_iterator<char>(ss), sBegin, sEnd, rx::regex("[[:space:]\\x0-\\x1f]{2,}"), " ");
33  string s = ss.str();
34  string::size_type len = s.length();
35  if (len > 1)
36  {
37  char cBegin = s[0];
38  char cEnd = s[len - 1];
39  if (cBegin == ' ' && cEnd == ' ')
40  return s.substr(1, len - 2);
41  else if (cBegin == ' ')
42  return s.substr(1);
43  else if (cEnd == ' ')
44  return s.substr(0, len - 1);
45  }
46  // Only one character/space??? Should not be possible
47  return s == " " ? "" : s;
48 }
49 
56 string cleanSpdxStatement(string::const_iterator sBegin, string::const_iterator sEnd)
57 {
58  stringstream ss;
59  rx::regex_replace(ostream_iterator<char>(ss), sBegin, sEnd, rx::regex("spdx-filecopyrighttext:", rx::regex_constants::icase), " ");
60  string s = ss.str();
61  return cleanGeneral(s.begin(), s.end());
62 }
63 
71 string cleanStatement(string::const_iterator sBegin, string::const_iterator sEnd)
72 {
73  stringstream ss;
74  rx::regex_replace(ostream_iterator<char>(ss), sBegin, sEnd, rx::regex("\n[[:space:][:punct:]]*"), " ");
75  string s = ss.str();
76  return cleanSpdxStatement(s.begin(), s.end());
77 }
78 
88 string cleanNonPrint(string::const_iterator sBegin, string::const_iterator sEnd)
89 {
90  string s(sBegin, sEnd);
91  const unsigned char *in = reinterpret_cast<const unsigned char*>(s.c_str());
92  int len = s.length();
93 
94  icu::UnicodeString out;
95  for (int i = 0; i < len;)
96  {
97  UChar32 uniChar;
98  size_t lastPos = i;
99  U8_NEXT(in, i, len, uniChar); // Get next UTF-8 char
100  if (uniChar > 0)
101  {
102  out.append(uniChar);
103  }
104  else
105  {
106  i = lastPos; // Rest pointer
107  U16_NEXT(in, i, len, uniChar); // Try to get failed input as UTF-16
108  if (U_IS_UNICODE_CHAR(uniChar) && uniChar > 0)
109  {
110  out.append(uniChar);
111  }
112  }
113  }
114  out.trim();
115 
116  string ret;
117  out.toUTF8String(ret);
118  return ret;
119 }
120 
129 string cleanMatch(const string& sText, const match& m)
130 {
131  string::const_iterator it = sText.begin();
132  icu::UnicodeString unicodeStr = fo::recodeToUnicode(string(it + m.start,
133  it + m.end));
134  string utfCompatibleText;
135 
136  unicodeStr.toUTF8String(utfCompatibleText);
137 
138  if (m.type == "statement")
139  return cleanStatement(utfCompatibleText.begin(), utfCompatibleText.end());
140  else
141  return cleanGeneral(utfCompatibleText.begin(), utfCompatibleText.end());
142 }
143 
string cleanGeneral(string::const_iterator sBegin, string::const_iterator sEnd)
Trim space at beginning and end.
Definition: cleanEntries.cc:29
int s
The socket that the CLI will use to communicate.
Definition: fo_cli.c:37
icu::UnicodeString recodeToUnicode(const std::string &input)
Definition: libfossUtils.cc:32
Store the results of a regex match.
Definition: scanners.hpp:28
const int start
Definition: scanners.hpp:35
const int end
Definition: scanners.hpp:35