FOSSology  4.7.1
Open Source License Compliance by Open Source Software
copyscan.cc
1 /*
2  SPDX-FileCopyrightText: © 2015,2022, Siemens AG
3  Author: Florian Krügel
4 
5  SPDX-License-Identifier: GPL-2.0-only
6 */
7 
8 #include "copyscan.hpp"
9 #include <cctype>
10 #include <algorithm>
11 #include "regexConfProvider.hpp"
12 
13 extern "C" {
14 #include "libfossology.h"
15 }
16 
// Type label handed to every match that ScanString() appends to its output list.
17 const string copyrightType("statement");
25 {
// NOTE(review): the constructor signature and the declaration of `rcp` are not
// part of this listing; the body is kept exactly as found.
// Load the "copyright" regex set unless it is already present in the regex map.
27  rcp.maybeLoad("copyright");
28 
29  // Compiled once per scanner instance; optimize pre-builds DFA states so
30  // repeated regex_search calls across many files pay no recompilation cost.
31  const auto icaseOptimize = rx::regex_constants::icase | rx::regex_constants::optimize;
32 
// Detection patterns, all read by ScanString().
33  regCopyright = rx::regex(rcp.getRegexValue("copyright","REG_COPYRIGHT"), icaseOptimize);
34  regException = rx::regex(rcp.getRegexValue("copyright","REG_EXCEPTION"), icaseOptimize);
// regNonBlank is the only pattern compiled without the icase flag.
35  regNonBlank = rx::regex(rcp.getRegexValue("copyright","REG_NON_BLANK"), rx::regex_constants::optimize);
36  regSimpleCopyright = rx::regex(rcp.getRegexValue("copyright","REG_SIMPLE_COPYRIGHT"), icaseOptimize);
37  regSpdxCopyright = rx::regex(rcp.getRegexValue("copyright","REG_SPDX_COPYRIGHT"), icaseOptimize);
38 
39  // Cleanup
// Patterns below are read by Cleanup(): the regStrip* ones are removed from
// the candidate text, the regException* ones and regRemoveFileStmt lead to a
// DEACTIVATE result.
40  regExceptionCopy = rx::regex(rcp.getRegexValue("copyright","REG_EXCEPTION_COPY"), icaseOptimize);
41  regRemoveFileStmt = rx::regex(rcp.getRegexValue("copyright","REG_REMOVE_FILE_STATEMENT"), icaseOptimize);
42  regStripLicenseTrail = rx::regex(rcp.getRegexValue("copyright","REG_STRIP_LICENSE_TRAIL"), icaseOptimize);
43  regStripTrademarkTrail = rx::regex(rcp.getRegexValue("copyright","REG_STRIP_TRADEMARK_TRAIL"), icaseOptimize);
// Note: the configuration key carries no "STRIP" part, unlike the member name.
44  regStripAllRightReserveTrail = rx::regex(rcp.getRegexValue("copyright","REG_ALL_RIGHT_RESERVE_TRAIL"), icaseOptimize);
45  regExceptionVerbFollow = rx::regex(rcp.getRegexValue("copyright","REG_EXCEPTION_VERB_FOLLOW"), icaseOptimize);
46  regExceptionAdjectivePrefix = rx::regex(rcp.getRegexValue("copyright","REG_EXCEPTION_ADJECTIVE_PREFIX"), icaseOptimize);
47  regExceptionTemplate = rx::regex(rcp.getRegexValue("copyright","REG_EXCEPTION_TEMPLATE"), icaseOptimize);
48  regExceptionPassive = rx::regex(rcp.getRegexValue("copyright","REG_EXCEPTION_PASSIVE"), icaseOptimize);
49  regStripCopySymNonYear = rx::regex(rcp.getRegexValue("copyright","REG_STRIP_COPYSYM_NONYEAR"), icaseOptimize);
50  regExceptionBinaryNoise = rx::regex(rcp.getRegexValue("copyright","REG_EXCEPTION_BINARY_NOISE"), icaseOptimize);
51  regExceptionMeta = rx::regex(rcp.getRegexValue("copyright","REG_EXCEPTION_META"), icaseOptimize);
52  regExceptionCharNameRun = rx::regex(rcp.getRegexValue("copyright","REG_EXCEPTION_CHARNAME_RUN"), icaseOptimize);
53 }
54 
/**
 * Scan a string for copyright statements and append the findings to `out`.
 *
 * Starting at the current position the text is searched for regCopyright.
 * A hit for which regException matches the text from the hit to the end is
 * skipped. Any other hit is extended over following lines, handed to
 * Cleanup(), and recorded according to the returned disposition. All recorded
 * offsets are relative to the beginning of `s`, and a recorded span is never
 * longer than 300 characters (the TOML array header line is the one exception,
 * it is recorded uncapped).
 *
 * @param s    text to scan
 * @param out  list receiving the matches; entries are only ever appended
 */
63 void hCopyrightScanner::ScanString(const string& s, list<match>& out) const
64 {
65 
66  string::const_iterator begin = s.begin();
67  string::const_iterator pos = begin;
68  string::const_iterator end = s.end();
69  while (pos != end)
70  {
71  // Find potential copyright statement
72  rx::smatch results;
73  if (!rx::regex_search(pos, end, results, regCopyright))
74  // No further copyright statement found
75  break;
76  string::const_iterator foundPos = results[0].first;
77 
78  if (!rx::regex_match(foundPos, end, regException))
79  {
// j marks the end of the statement: initially the newline that ends the line
// of the hit (or the end of the text).
90  string::const_iterator j = find(foundPos, end, '\n');
91 
// A line matching regSpdxCopyright is never extended over following lines.
92  bool isSpdx = rx::regex_search(foundPos, j, regSpdxCopyright);
93  if (!isSpdx)
94  {
// Pull in following lines one at a time. Stop in front of a line that
// contains an SPDX tag or another simple copyright, or that regNonBlank does
// not match in full (presumably a blank line - depends on the configured
// pattern).
95  while (j != end)
96  {
97  string::const_iterator beginOfLine = j;
98  ++beginOfLine;
99  string::const_iterator endOfLine = find(beginOfLine, end, '\n');
100  if (rx::regex_search(beginOfLine, endOfLine, regSpdxCopyright)){
101  // Found end
102  break;
103  }
104  if (rx::regex_search(beginOfLine, endOfLine, regSimpleCopyright)
105  || !rx::regex_match(beginOfLine, endOfLine, regNonBlank))
106  {
107  // Found end
108  break;
109  }
110  j = endOfLine;
111  }
112  }
113 
114  string raw = string(foundPos, j);
115 
116  // SPDX-FileCopyrightText = [ ... ] array format for toml files.
117  if (isSpdx) {
// The header is recognised by an opening bracket at the very end of the line.
118  static const rx::regex reArrayOpen("\\[\\s*$");
119  if (rx::regex_search(raw, reArrayOpen)) {
120  // Deactivate the array-header line itself
121  int startPos = foundPos - begin;
122  int endPos = j - begin;
123  out.push_back(match(startPos, endPos, copyrightType, false));
124 
125  // Walk subsequent lines extracting each quoted copyright element
126  static const rx::regex reQuoted("\"([^\"]+)\"");
127  static const rx::regex reClose("^[[:space:]]*\\]");
128  string::const_iterator arrPos = j;
129  bool arrayClosedCleanly = false;
130  while (arrPos != end) {
// arrPos sits on a newline; the next line starts right behind it.
131  string::const_iterator lineStart = arrPos + 1;
132  if (lineStart >= end) break;
133  string::const_iterator lineEnd = find(lineStart, end, '\n');
134  string lineStr(lineStart, lineEnd);
135 
// A line starting (after optional whitespace) with ']' ends the array.
136  if (rx::regex_search(lineStr, reClose)) {
137  arrPos = lineEnd;
138  arrayClosedCleanly = true;
139  break;
140  }
141 
// Only the first quoted string of a line is considered.
142  rx::smatch qm;
143  if (rx::regex_search(lineStr, qm, reQuoted)) {
// Cleanup() receives the element with an SPDX tag put in front of it.
144  string elemRaw = "SPDX-FileCopyrightText: " + qm[1].str();
145  CleanupResult er = Cleanup(elemRaw);
// The recorded span is the whole line (not just the quoted part), capped at
// 300 characters. Elements with any other disposition are not recorded.
146  int eStart = lineStart - begin;
147  int eEnd = lineEnd - begin;
148  if (eEnd - eStart > 300) eEnd = eStart + 300;
149  if (er.disposition == CleanupResult::Disposition::KEEP)
150  out.push_back(match(eStart, eEnd, copyrightType));
151  else if (er.disposition == CleanupResult::Disposition::DEACTIVATE)
152  out.push_back(match(eStart, eEnd, copyrightType, false));
153  }
154  arrPos = lineEnd;
155  }
// Resume behind the closing line. Without a closing line the scan resumes
// directly behind the header line instead; matches already appended to `out`
// by the walk above are left in place in that case.
156  pos = arrayClosedCleanly ? arrPos : j;
157  continue;
158  }
159  }
160 
161  CleanupResult result = Cleanup(raw);
162 
163  if (result.disposition == CleanupResult::Disposition::DISCARD) {
164  // Definitively not a copyright
165  pos = j;
166  continue;
167  }
168 
169  int startPos = foundPos - begin;
170  int endPos = j - begin;
171 
// DEACTIVATE: record the raw span (capped) with the fourth match argument set
// to false - presumably the "active" flag; confirm against scanners.hpp.
172  if (result.disposition == CleanupResult::Disposition::DEACTIVATE) {
173  if (endPos - startPos > 300)
174  endPos = startPos + 300;
175  out.push_back(match(startPos, endPos, copyrightType, false));
176  pos = j;
177  continue;
178  }
179 
// KEEP: the cleaned text is cut to at most 300 characters.
180  string& cleaned = result.content;
181  if (cleaned.size() > 300)
182  cleaned = cleaned.substr(0, 300);
183 
// For non-SPDX statements the span length is the length of the cleaned text,
// counted from the start of the hit; SPDX statements keep the raw line span.
184  if (!isSpdx)
185  endPos = startPos + (int)cleaned.size();
186  if (endPos - startPos > 300)
187  endPos = startPos + 300;
188 
189  out.push_back(match(startPos, endPos, copyrightType));
190  pos = j;
191  }
192  else
193  {
194  // An exception: this is not a copyright statement: continue at the end of this statement
195  pos = results[0].second;
196  }
197  }
198 }
199 
200 CleanupResult hCopyrightScanner::Cleanup(const string &raw) const {
201  using Disposition = CleanupResult::Disposition;
202 
203  if (rx::regex_match(raw, regRemoveFileStmt)) {
204  return {"", Disposition::DEACTIVATE};
205  }
206 
207  string cleaned = raw;
208 
209  // regex_replace always allocates even with no match; guard with regex_search.
210  if (rx::regex_search(cleaned, regStripLicenseTrail))
211  cleaned = rx::regex_replace(cleaned, regStripLicenseTrail, string());
212  if (rx::regex_search(cleaned, regStripTrademarkTrail))
213  cleaned = rx::regex_replace(cleaned, regStripTrademarkTrail, string());
214  if (rx::regex_search(cleaned, regStripAllRightReserveTrail))
215  cleaned = rx::regex_replace(cleaned, regStripAllRightReserveTrail, string());
216  if (rx::regex_search(cleaned, regStripCopySymNonYear))
217  cleaned = rx::regex_replace(cleaned, regStripCopySymNonYear, string());
218 
219  // DEACTIVATE
220  if (rx::regex_search(cleaned, regExceptionTemplate) ||
221  rx::regex_search(cleaned, regExceptionBinaryNoise) ||
222  rx::regex_search(cleaned, regExceptionMeta) ||
223  rx::regex_search(cleaned, regExceptionCharNameRun)) {
224  return {"", Disposition::DEACTIVATE};
225  }
226 
227  // Limit exception checks to the first copyright context; use iterators to
228  // avoid a substring copy.
229  {
230  static const string kCopyright("copyright");
231  auto ciEq = [](unsigned char a, unsigned char b){
232  return ::tolower(a) == ::tolower(b);
233  };
234  auto dBegin = cleaned.cbegin();
235  auto dEnd = cleaned.cend();
236  auto it1 = std::search(dBegin, dEnd, kCopyright.cbegin(), kCopyright.cend(), ciEq);
237  if (it1 != dEnd) {
238  auto it2 = std::search(it1 + 9, dEnd, kCopyright.cbegin(), kCopyright.cend(), ciEq);
239  if (it2 != dEnd)
240  dEnd = it2;
241  }
242  if (rx::regex_search(dBegin, dEnd, regExceptionCopy) ||
243  rx::regex_search(dBegin, dEnd, regExceptionVerbFollow) ||
244  rx::regex_search(dBegin, dEnd, regExceptionAdjectivePrefix) ||
245  rx::regex_search(dBegin, dEnd, regExceptionPassive)) {
246  return {"", Disposition::DEACTIVATE};
247  }
248  }
249 
250  RemoveNoisePatterns(cleaned);
251  TrimPunctuation(cleaned);
252  NormalizeCopyright(cleaned);
253  StripSuffixes(cleaned);
254 
255  if (cleaned.empty()) {
256  return {"", Disposition::DISCARD};
257  }
258 
259  // Discard bare keyword: "copyright"(9) / "copyrights"(10) / "copyrighted"(11).
260  if (cleaned.size() >= 9 && cleaned.size() <= 11) {
261  string lower = cleaned;
262  transform(lower.begin(), lower.end(), lower.begin(), ::tolower);
263  if (lower == "copyright" || lower == "copyrights" || lower == "copyrighted") {
264  return {"", Disposition::DISCARD};
265  }
266  }
267 
268  return {cleaned, Disposition::KEEP};
269 }
270 
271 void hCopyrightScanner::TrimPunctuation(string &text) const{
272  static const string trimCharsAll = " \t,\'\"-:;&@!";
273  static const string trimStartOnly = ".>)]\\/";
274  static const string trimEndOnly = "<([\\/";
275 
276  size_t start = text.find_first_not_of(trimCharsAll);
277  size_t end = text.find_last_not_of(trimCharsAll);
278 
279  if (start == string::npos) {
280  text.clear();
281  return;
282  }
283 
284  text = text.substr(start, end - start + 1);
285 
286  // Count, then erase once.
287  size_t leading = 0;
288  while (leading < text.size() && trimStartOnly.find(text[leading]) != string::npos)
289  ++leading;
290  if (leading) text.erase(0, leading);
291 
292  while (!text.empty() && trimEndOnly.find(text.back()) != string::npos)
293  text.pop_back();
294 }
295 
296 void hCopyrightScanner::RemoveNoisePatterns(string& text) const{
297  static const vector<string> patterns = {
298  "<p>", "<a href", "date-of-software", "date-of-document",
299  " $ ", " ? ", "</a>", "( )", "()"
300  };
301 
302  for (const auto& word : patterns) {
303  size_t pos;
304  while ((pos = text.find(word)) != string::npos) {
305  text.replace(pos, word.length(), " ");
306  }
307  }
308 }
309 
310 void hCopyrightScanner::NormalizeCopyright(string& text) const {
311  static const vector<pair<string, string>> replacements = {
312  {"SPDX-FileCopyrightText", "Copyright"},
313  {"AssemblyCopyright", "Copyright"},
314  {"AppCopyright", "Copyright"},
315  {"JCOPYRIGHT", "Copyright"},
316  {"COPYRIGHT Copyright", "Copyright"},
317  {"Copyright Copyright", "Copyright"},
318  {"Copyright copyright", "Copyright"},
319  {"copyright copyright", "Copyright"},
320  {"copyright Copyright", "Copyright"},
321  {"copyright\"Copyright", "Copyright"},
322  {"copyright\" Copyright", "Copyright"}
323  };
324 
325  for (const auto& pair : replacements) {
326  const string& from = pair.first;
327  const string& to = pair.second;
328 
329  size_t pos;
330  while ((pos = text.find(from)) != string::npos) {
331  text.replace(pos, from.length(), to);
332  }
333  }
334 }
335 
336 void hCopyrightScanner::StripSuffixes(string& text) const{
337  static const vector<string> suffixes = {
338  "copyright", ",", "year", "parts", "0", "1", "author", "all", "some", "and"
339  };
340 
341  for (const auto& suffix : suffixes) {
342  if (text.length() > suffix.length() + 1 &&
343  text.size() >= suffix.size() &&
344  text.compare(text.size() - suffix.size(), suffix.size(), suffix) == 0)
345  {
346  text.erase(text.size() - suffix.size());
347  break;
348  }
349  }
350 }
351 
352 
353 
Provide regex using conf file.
void maybeLoad(const std::string &identity)
Check if identity already loaded in RegexMap, if not load them.
const char * getRegexValue(const std::string &name, const std::string &key)
Get the regex as string from the RegexMap.
rx::regex regNonBlank
Definition: copyscan.hpp:61
void ScanString(const string &s, list< match > &results) const
Scan a given string for copyright statements.
Definition: copyscan.cc:63
rx::regex regSpdxCopyright
Definition: copyscan.hpp:62
hCopyrightScanner()
Constructor for default hCopyrightScanner.
Definition: copyscan.cc:24
rx::regex regCopyright
Definition: copyscan.hpp:61
rx::regex regSimpleCopyright
Definition: copyscan.hpp:61
rx::regex regException
Definition: copyscan.hpp:61
int s
The socket that the CLI will use to communicate.
Definition: fo_cli.c:37
The main FOSSology C library.
start($application)
Start the application. Assumes the application is restartable via /etc/init.d/<script>...
Definition: pkgConfig.php:1214
Outcome of the Cleanup() function.
Definition: copyscan.hpp:20
list_t type structure used to keep various lists. (e.g. there are multiple lists).
Definition: nomos.h:308
Store the results of a regex match.
Definition: scanners.hpp:28