FOSSology  4.7.0-rc1
Open Source License Compliance by Open Source Software
copyscan.cc
1 /*
2  SPDX-FileCopyrightText: © 2015,2022, Siemens AG
3  Author: Florian Krügel
4 
5  SPDX-License-Identifier: GPL-2.0-only
6 */
7 
8 #include "copyscan.hpp"
9 #include <cctype>
10 #include <algorithm>
11 #include "regexConfProvider.hpp"
12 
13 extern "C" {
14 #include "libfossology.h"
15 }
16 
// Type label attached to every match reported by the copyright scanner.
const string copyrightType = "statement";
25 {
27  rcp.maybeLoad("copyright");
28 
29  regCopyright = rx::regex(rcp.getRegexValue("copyright","REG_COPYRIGHT"),
30  rx::regex_constants::icase);
31 
32  regException = rx::regex(rcp.getRegexValue("copyright","REG_EXCEPTION"),
33  rx::regex_constants::icase);
34  regNonBlank = rx::regex(rcp.getRegexValue("copyright","REG_NON_BLANK"));
35 
36  regSimpleCopyright = rx::regex(rcp.getRegexValue("copyright","REG_SIMPLE_COPYRIGHT"),
37  rx::regex_constants::icase);
38  regSpdxCopyright = rx::regex(rcp.getRegexValue("copyright","REG_SPDX_COPYRIGHT"),
39  rx::regex_constants::icase);
40 
41  // Cleanup
42  regExceptionCopy = rx::regex(rcp.getRegexValue("copyright","REG_EXCEPTION_COPY"),
43  rx::regex_constants::icase);
44  regRemoveFileStmt = rx::regex(rcp.getRegexValue("copyright","REG_REMOVE_FILE_STATEMENT"),
45  rx::regex_constants::icase);
46  regStripLicenseTrail = rx::regex(rcp.getRegexValue("copyright", "REG_STRIP_LICENSE_TRAIL"),
47  rx::regex_constants::icase);
48  regStripTrademarkTrail = rx::regex(rcp.getRegexValue("copyright", "REG_STRIP_TRADEMARK_TRAIL"),
49  rx::regex_constants::icase);
50  regStripAllRightReserveTrail = rx::regex(rcp.getRegexValue("copyright", "REG_ALL_RIGHT_RESERVE_TRAIL"),
51  rx::regex_constants::icase);
52  regExceptionVerbFollow = rx::regex(rcp.getRegexValue("copyright", "REG_EXCEPTION_VERB_FOLLOW"),
53  rx::regex_constants::icase);
54  regExceptionAdjectivePrefix = rx::regex(rcp.getRegexValue("copyright", "REG_EXCEPTION_ADJECTIVE_PREFIX"),
55  rx::regex_constants::icase);
56  regExceptionTemplate = rx::regex(rcp.getRegexValue("copyright", "REG_EXCEPTION_TEMPLATE"),
57  rx::regex_constants::icase);
58  regExceptionPassive = rx::regex(rcp.getRegexValue("copyright", "REG_EXCEPTION_PASSIVE"),
59  rx::regex_constants::icase);
60  regStripCopySymNonYear = rx::regex(rcp.getRegexValue("copyright", "REG_STRIP_COPYSYM_NONYEAR"),
61  rx::regex_constants::icase);
62  regExceptionBinaryNoise = rx::regex(rcp.getRegexValue("copyright", "REG_EXCEPTION_BINARY_NOISE"),
63  rx::regex_constants::icase);
64  regExceptionMeta = rx::regex(rcp.getRegexValue("copyright", "REG_EXCEPTION_META"),
65  rx::regex_constants::icase);
66  regExceptionCharNameRun = rx::regex(rcp.getRegexValue("copyright", "REG_EXCEPTION_CHARNAME_RUN"),
67  rx::regex_constants::icase);
68 }
69 
78 void hCopyrightScanner::ScanString(const string& s, list<match>& out) const
79 {
80 
81  string::const_iterator begin = s.begin();
82  string::const_iterator pos = begin;
83  string::const_iterator end = s.end();
84  while (pos != end)
85  {
86  // Find potential copyright statement
87  rx::smatch results;
88  if (!rx::regex_search(pos, end, results, regCopyright))
89  // No further copyright statement found
90  break;
91  string::const_iterator foundPos = results[0].first;
92 
93  if (!rx::regex_match(foundPos, end, regException))
94  {
105  string::const_iterator j = find(foundPos, end, '\n');
106  while (j != end)
107  {
108  string::const_iterator beginOfLine = j;
109  ++beginOfLine;
110  string::const_iterator endOfLine = find(beginOfLine, end, '\n');
111  if (rx::regex_search(beginOfLine, endOfLine, regSpdxCopyright)){
112  // Found end
113  break;
114  }
115  if (rx::regex_search(beginOfLine, endOfLine, regSimpleCopyright)
116  || !rx::regex_match(beginOfLine, endOfLine, regNonBlank))
117  {
118  // Found end
119  break;
120  }
121  j = endOfLine;
122  }
123  string raw = string(foundPos, j);
124  CleanupResult result = Cleanup(raw);
125 
126  if (result.disposition == CleanupResult::Disposition::DISCARD) {
127  // Definitively not a copyright
128  pos = j;
129  continue;
130  }
131 
132  if (result.disposition == CleanupResult::Disposition::DEACTIVATE) {
133  int startPos = foundPos - begin;
134  int endPos = j - begin;
135  if (endPos - startPos > 300)
136  endPos = startPos + 300;
137  out.push_back(match(startPos, endPos, copyrightType, false));
138  pos = j;
139  continue;
140  }
141 
142  string& cleaned = result.content;
143  if (cleaned.size() > 300)
144  cleaned = cleaned.substr(0, 300);
145 
146  out.push_back(match(foundPos - begin, (foundPos - begin) + cleaned.size(), copyrightType));
147  pos = j;
148  }
149  else
150  {
151  // An exception: this is not a copyright statement: continue at the end of this statement
152  pos = results[0].second;
153  }
154  }
155 }
156 
157 CleanupResult hCopyrightScanner::Cleanup(const string &raw) const {
158  using Disposition = CleanupResult::Disposition;
159 
160  if (rx::regex_match(raw, regRemoveFileStmt)) {
161  return {"", Disposition::DEACTIVATE};
162  }
163 
164  string cleaned = raw;
165  cleaned = rx::regex_replace(cleaned, regStripLicenseTrail, "");
166  cleaned = rx::regex_replace(cleaned, regStripTrademarkTrail, "");
167  cleaned = rx::regex_replace(cleaned, regStripAllRightReserveTrail, "");
168  cleaned = rx::regex_replace(cleaned, regStripCopySymNonYear, "");
169 
170  // DEACTIVATE
171  if (rx::regex_search(cleaned, regExceptionTemplate) ||
172  rx::regex_search(cleaned, regExceptionBinaryNoise) ||
173  rx::regex_search(cleaned, regExceptionMeta) ||
174  rx::regex_search(cleaned, regExceptionCharNameRun)) {
175  return {"", Disposition::DEACTIVATE};
176  }
177 
178  // DEACTIVATE: Limit scope to the first copyright context only.
179  string deactivateScope = cleaned;
180  {
181  string lower = cleaned;
182  transform(lower.begin(), lower.end(), lower.begin(), ::tolower);
183  size_t first = lower.find("copyright");
184  if (first != string::npos) {
185  size_t second = lower.find("copyright", first + 9);
186  if (second != string::npos) {
187  deactivateScope = cleaned.substr(0, second);
188  }
189  }
190  }
191 
192  if (rx::regex_search(deactivateScope, regExceptionCopy) ||
193  rx::regex_search(deactivateScope, regExceptionVerbFollow) ||
194  rx::regex_search(deactivateScope, regExceptionAdjectivePrefix) ||
195  rx::regex_search(deactivateScope, regExceptionPassive)) {
196  return {"", Disposition::DEACTIVATE};
197  }
198 
199  RemoveNoisePatterns(cleaned);
200  TrimPunctuation(cleaned);
201  NormalizeCopyright(cleaned);
202  StripSuffixes(cleaned);
203 
204  if (cleaned.empty()) {
205  return {"", Disposition::DISCARD};
206  }
207 
208  return {cleaned, Disposition::KEEP};
209 }
210 
211 void hCopyrightScanner::TrimPunctuation(string &text) const{
212  const string trimCharsAll = ",\'\"-:;&@!";
213  const string trimStartOnly = ".>)]\\/";
214  const string trimEndOnly = "<([\\/";
215 
216  size_t start = text.find_first_not_of(trimCharsAll);
217  size_t end = text.find_last_not_of(trimCharsAll);
218 
219  if (start == string::npos) {
220  text.clear();
221  return;
222  }
223 
224  text = text.substr(start, end - start + 1);
225 
226  while (!text.empty() && trimStartOnly.find(text.front()) != string::npos) {
227  text.erase(0, 1);
228  }
229 
230  while (!text.empty() && trimEndOnly.find(text.back()) != string::npos) {
231  text.pop_back();
232  }
233 }
234 
235 void hCopyrightScanner::RemoveNoisePatterns(string& text) const{
236  const vector<string> patterns = {
237  "<p>", "<a href", "date-of-software", "date-of-document",
238  " $ ", " ? ", "</a>", "( )", "()"
239  };
240 
241  for (const auto& word : patterns) {
242  size_t pos;
243  while ((pos = text.find(word)) != string::npos) {
244  text.replace(pos, word.length(), " ");
245  }
246  }
247 }
248 
249 void hCopyrightScanner::NormalizeCopyright(string& text) const {
250  const vector<pair<string, string>> replacements = {
251  {"SPDX-FileCopyrightText", "Copyright"},
252  {"AssemblyCopyright", "Copyright"},
253  {"AppCopyright", "Copyright"},
254  {"JCOPYRIGHT", "Copyright"},
255  {"COPYRIGHT Copyright", "Copyright"},
256  {"Copyright Copyright", "Copyright"},
257  {"Copyright copyright", "Copyright"},
258  {"copyright copyright", "Copyright"},
259  {"copyright Copyright", "Copyright"},
260  {"copyright\"Copyright", "Copyright"},
261  {"copyright\" Copyright", "Copyright"}
262  };
263 
264  for (const auto& pair : replacements) {
265  const string& from = pair.first;
266  const string& to = pair.second;
267 
268  size_t pos;
269  while ((pos = text.find(from)) != string::npos) {
270  text.replace(pos, from.length(), to);
271  }
272  }
273 }
274 
275 void hCopyrightScanner::StripSuffixes(string& text) const{
276  const vector<string> suffixes = {
277  "copyright", ",", "year", "parts", "0", "1", "author", "all", "some", "and"
278  };
279 
280  for (const auto& suffix : suffixes) {
281  if (text.length() > suffix.length() + 1 &&
282  text.size() >= suffix.size() &&
283  text.compare(text.size() - suffix.size(), suffix.size(), suffix) == 0)
284  {
285  text.erase(text.size() - suffix.size());
286  break;
287  }
288  }
289 }
290 
291 
292 
Provide regex using conf file.
void maybeLoad(const std::string &identity)
Check if identity already loaded in RegexMap, if not load them.
const char * getRegexValue(const std::string &name, const std::string &key)
Get the regex as string from the RegexMap.
rx::regex regNonBlank
Definition: copyscan.hpp:61
void ScanString(const string &s, list< match > &results) const
Scan a given string for copyright statements.
Definition: copyscan.cc:78
rx::regex regSpdxCopyright
Definition: copyscan.hpp:62
hCopyrightScanner()
Constructor for default hCopyrightScanner.
Definition: copyscan.cc:24
rx::regex regCopyright
Definition: copyscan.hpp:61
rx::regex regSimpleCopyright
Definition: copyscan.hpp:61
rx::regex regException
Definition: copyscan.hpp:61
int s
The socket that the CLI will use to communicate.
Definition: fo_cli.c:37
The main FOSSology C library.
start($application)
Start the application. Assumes the application is restartable via /etc/init.d/<script>...
Definition: pkgConfig.php:1214
Outcome of the Cleanup() function.
Definition: copyscan.hpp:20
list_t type structure used to keep various lists. (e.g. there are multiple lists).
Definition: nomos.h:308
Store the results of a regex match.
Definition: scanners.hpp:28