FOSSology  4.7.0-rc1
Open Source License Compliance by Open Source Software
copyscan.cc
1 /*
2  SPDX-FileCopyrightText: © 2015,2022, Siemens AG
3  Author: Florian Krügel
4 
5  SPDX-License-Identifier: GPL-2.0-only
6 */
7 
8 #include "copyscan.hpp"
9 #include <cctype>
10 #include <algorithm>
11 #include "regexConfProvider.hpp"
12 
13 extern "C" {
14 #include "libfossology.h"
15 }
16 
17 const string copyrightType("statement");
25 {
27  rcp.maybeLoad("copyright");
28 
29  regCopyright = rx::regex(rcp.getRegexValue("copyright","REG_COPYRIGHT"),
30  rx::regex_constants::icase);
31 
32  regException = rx::regex(rcp.getRegexValue("copyright","REG_EXCEPTION"),
33  rx::regex_constants::icase);
34  regNonBlank = rx::regex(rcp.getRegexValue("copyright","REG_NON_BLANK"));
35 
36  regSimpleCopyright = rx::regex(rcp.getRegexValue("copyright","REG_SIMPLE_COPYRIGHT"),
37  rx::regex_constants::icase);
38  regSpdxCopyright = rx::regex(rcp.getRegexValue("copyright","REG_SPDX_COPYRIGHT"),
39  rx::regex_constants::icase);
40 
41  // Cleanup
42  regExceptionCopy = rx::regex(rcp.getRegexValue("copyright","REG_EXCEPTION_COPY"),
43  rx::regex_constants::icase);
44  regRemoveFileStmt = rx::regex(rcp.getRegexValue("copyright","REG_REMOVE_FILE_STATEMENT"),
45  rx::regex_constants::icase);
46  regStripLicenseTrail = rx::regex(rcp.getRegexValue("copyright", "REG_STRIP_LICENSE_TRAIL"),
47  rx::regex_constants::icase);
48  regStripTrademarkTrail = rx::regex(rcp.getRegexValue("copyright", "REG_STRIP_TRADEMARK_TRAIL"),
49  rx::regex_constants::icase);
50  regStripAllRightReserveTrail = rx::regex(rcp.getRegexValue("copyright", "REG_ALL_RIGHT_RESERVE_TRAIL"),
51  rx::regex_constants::icase);
52  regExceptionVerbFollow = rx::regex(rcp.getRegexValue("copyright", "REG_EXCEPTION_VERB_FOLLOW"),
53  rx::regex_constants::icase);
54  regExceptionAdjectivePrefix = rx::regex(rcp.getRegexValue("copyright", "REG_EXCEPTION_ADJECTIVE_PREFIX"),
55  rx::regex_constants::icase);
56  regExceptionTemplate = rx::regex(rcp.getRegexValue("copyright", "REG_EXCEPTION_TEMPLATE"),
57  rx::regex_constants::icase);
58  regExceptionPassive = rx::regex(rcp.getRegexValue("copyright", "REG_EXCEPTION_PASSIVE"),
59  rx::regex_constants::icase);
60  regStripCopySymNonYear = rx::regex(rcp.getRegexValue("copyright", "REG_STRIP_COPYSYM_NONYEAR"),
61  rx::regex_constants::icase);
62  regExceptionBinaryNoise = rx::regex(rcp.getRegexValue("copyright", "REG_EXCEPTION_BINARY_NOISE"),
63  rx::regex_constants::icase);
64  regExceptionMeta = rx::regex(rcp.getRegexValue("copyright", "REG_EXCEPTION_META"),
65  rx::regex_constants::icase);
66  regExceptionCharNameRun = rx::regex(rcp.getRegexValue("copyright", "REG_EXCEPTION_CHARNAME_RUN"),
67  rx::regex_constants::icase);
68 }
69 
78 void hCopyrightScanner::ScanString(const string& s, list<match>& out) const
79 {
80 
81  string::const_iterator begin = s.begin();
82  string::const_iterator pos = begin;
83  string::const_iterator end = s.end();
84  while (pos != end)
85  {
86  // Find potential copyright statement
87  rx::smatch results;
88  if (!rx::regex_search(pos, end, results, regCopyright))
89  // No further copyright statement found
90  break;
91  string::const_iterator foundPos = results[0].first;
92 
93  if (!rx::regex_match(foundPos, end, regException))
94  {
105  string::const_iterator j = find(foundPos, end, '\n');
106  while (j != end)
107  {
108  string::const_iterator beginOfLine = j;
109  ++beginOfLine;
110  string::const_iterator endOfLine = find(beginOfLine, end, '\n');
111  if (rx::regex_search(beginOfLine, endOfLine, regSpdxCopyright)){
112  // Found end
113  break;
114  }
115  if (rx::regex_search(beginOfLine, endOfLine, regSimpleCopyright)
116  || !rx::regex_match(beginOfLine, endOfLine, regNonBlank))
117  {
118  // Found end
119  break;
120  }
121  j = endOfLine;
122  }
123  string raw = string(foundPos, j);
124  CleanupResult result = Cleanup(raw);
125 
126  if (result.disposition == CleanupResult::Disposition::DISCARD) {
127  // Definitively not a copyright
128  pos = j;
129  continue;
130  }
131 
132  if (result.disposition == CleanupResult::Disposition::DEACTIVATE) {
133  // deactivated copyright section
134  out.push_back(match(foundPos - begin, j - begin, copyrightType, false));
135  pos = j;
136  continue;
137  }
138 
139  string& cleaned = result.content;
140  if (cleaned.size() > 300)
141  cleaned = cleaned.substr(0, 300);
142 
143  out.push_back(match(foundPos - begin, (foundPos - begin) + cleaned.size(), copyrightType));
144  pos = j;
145  }
146  else
147  {
148  // An exception: this is not a copyright statement: continue at the end of this statement
149  pos = results[0].second;
150  }
151  }
152 }
153 
154 CleanupResult hCopyrightScanner::Cleanup(const string &raw) const {
155  using Disposition = CleanupResult::Disposition;
156 
157  if (rx::regex_match(raw, regRemoveFileStmt)) {
158  return {"", Disposition::DISCARD};
159  }
160 
161  string cleaned = raw;
162  cleaned = rx::regex_replace(cleaned, regStripLicenseTrail, "");
163  cleaned = rx::regex_replace(cleaned, regStripTrademarkTrail, "");
164  cleaned = rx::regex_replace(cleaned, regStripAllRightReserveTrail, "");
165  cleaned = rx::regex_replace(cleaned, regStripCopySymNonYear, "");
166 
167  // DISCARD
168  if (rx::regex_search(cleaned, regExceptionTemplate) ||
169  rx::regex_search(cleaned, regExceptionBinaryNoise) ||
170  rx::regex_search(cleaned, regExceptionMeta) ||
171  rx::regex_search(cleaned, regExceptionCharNameRun)) {
172  return {"", Disposition::DISCARD};
173  }
174 
175  // DEACTIVATE
176  if (rx::regex_search(cleaned, regExceptionCopy) ||
177  rx::regex_search(cleaned, regExceptionVerbFollow) ||
178  rx::regex_search(cleaned, regExceptionAdjectivePrefix) ||
179  rx::regex_search(cleaned, regExceptionPassive)) {
180  return {"", Disposition::DEACTIVATE};
181  }
182 
183  RemoveNoisePatterns(cleaned);
184  TrimPunctuation(cleaned);
185  NormalizeCopyright(cleaned);
186  StripSuffixes(cleaned);
187 
188  if (cleaned.empty()) {
189  return {"", Disposition::DISCARD};
190  }
191 
192  return {cleaned, Disposition::KEEP};
193 }
194 
195 void hCopyrightScanner::TrimPunctuation(string &text) const{
196  const string trimCharsAll = ",\'\"-:;&@!";
197  const string trimStartOnly = ".>)]\\/";
198  const string trimEndOnly = "<([\\/";
199 
200  size_t start = text.find_first_not_of(trimCharsAll);
201  size_t end = text.find_last_not_of(trimCharsAll);
202 
203  if (start == string::npos) {
204  text.clear();
205  return;
206  }
207 
208  text = text.substr(start, end - start + 1);
209 
210  while (!text.empty() && trimStartOnly.find(text.front()) != string::npos) {
211  text.erase(0, 1);
212  }
213 
214  while (!text.empty() && trimEndOnly.find(text.back()) != string::npos) {
215  text.pop_back();
216  }
217 }
218 
219 void hCopyrightScanner::RemoveNoisePatterns(string& text) const{
220  const vector<string> patterns = {
221  "<p>", "<a href", "date-of-software", "date-of-document",
222  " $ ", " ? ", "</a>", "( )", "()"
223  };
224 
225  for (const auto& word : patterns) {
226  size_t pos;
227  while ((pos = text.find(word)) != string::npos) {
228  text.replace(pos, word.length(), " ");
229  }
230  }
231 }
232 
233 void hCopyrightScanner::NormalizeCopyright(string& text) const {
234  const vector<pair<string, string>> replacements = {
235  {"SPDX-FileCopyrightText", "Copyright"},
236  {"AssemblyCopyright", "Copyright"},
237  {"AppCopyright", "Copyright"},
238  {"JCOPYRIGHT", "Copyright"},
239  {"COPYRIGHT Copyright", "Copyright"},
240  {"Copyright Copyright", "Copyright"},
241  {"Copyright copyright", "Copyright"},
242  {"copyright copyright", "Copyright"},
243  {"copyright Copyright", "Copyright"},
244  {"copyright\"Copyright", "Copyright"},
245  {"copyright\" Copyright", "Copyright"}
246  };
247 
248  for (const auto& pair : replacements) {
249  const string& from = pair.first;
250  const string& to = pair.second;
251 
252  size_t pos;
253  while ((pos = text.find(from)) != string::npos) {
254  text.replace(pos, from.length(), to);
255  }
256  }
257 }
258 
259 void hCopyrightScanner::StripSuffixes(string& text) const{
260  const vector<string> suffixes = {
261  "copyright", ",", "year", "parts", "0", "1", "author", "all", "some", "and"
262  };
263 
264  for (const auto& suffix : suffixes) {
265  if (text.length() > suffix.length() + 1 &&
266  text.size() >= suffix.size() &&
267  text.compare(text.size() - suffix.size(), suffix.size(), suffix) == 0)
268  {
269  text.erase(text.size() - suffix.size());
270  break;
271  }
272  }
273 }
274 
275 
276 
Provide regex using conf file.
void maybeLoad(const std::string &identity)
Check whether the identity is already loaded in the RegexMap; if not, load it.
const char * getRegexValue(const std::string &name, const std::string &key)
Get the regex as string from the RegexMap.
rx::regex regNonBlank
Definition: copyscan.hpp:61
void ScanString(const string &s, list< match > &results) const
Scan a given string for copyright statements.
Definition: copyscan.cc:78
rx::regex regSpdxCopyright
Definition: copyscan.hpp:62
hCopyrightScanner()
Constructor for default hCopyrightScanner.
Definition: copyscan.cc:24
rx::regex regCopyright
Definition: copyscan.hpp:61
rx::regex regSimpleCopyright
Definition: copyscan.hpp:61
rx::regex regException
Definition: copyscan.hpp:61
int s
The socket that the CLI will use to communicate.
Definition: fo_cli.c:37
The main FOSSology C library.
start($application)
Start the application. Assumes the application is restartable via /etc/init.d/<script>....
Definition: pkgConfig.php:1214
Outcome of the Cleanup() function.
Definition: copyscan.hpp:20
list_t type structure used to keep various lists (e.g. there are multiple lists).
Definition: nomos.h:308
Store the results of a regex match.
Definition: scanners.hpp:28