FOSSology  4.5.1
Open Source License Compliance by Open Source Software
copyscan.cc
1 /*
2  SPDX-FileCopyrightText: © 2015,2022, Siemens AG
3  Author: Florian Krügel
4 
5  SPDX-License-Identifier: GPL-2.0-only
6 */
7 
8 #include "copyscan.hpp"
9 #include <cctype>
10 #include <algorithm>
11 #include "regexConfProvider.hpp"
12 
// Type label passed as the third argument of every match that ScanString emits.
13 const string copyrightType("statement");
// Body of the routine that compiles every regular expression used by the scanner.
// NOTE(review): the function header and the declaration of `rcp` are not visible
// in this listing - confirm both against the complete file.
21 {
// Make sure the "copyright" regex set is available before reading values from it.
23  rcp.maybeLoad("copyright");
24 
// Detection patterns: each is fetched by key from the "copyright" set.
// All are compiled case-insensitively except regNonBlank.
25  regCopyright = rx::regex(rcp.getRegexValue("copyright","REG_COPYRIGHT"),
26  rx::regex_constants::icase);
27 
28  regException = rx::regex(rcp.getRegexValue("copyright","REG_EXCEPTION"),
29  rx::regex_constants::icase);
// regNonBlank is the only pattern compiled without the icase flag.
30  regNonBlank = rx::regex(rcp.getRegexValue("copyright","REG_NON_BLANK"));
31 
32  regSimpleCopyright = rx::regex(rcp.getRegexValue("copyright","REG_SIMPLE_COPYRIGHT"),
33  rx::regex_constants::icase);
34  regSpdxCopyright = rx::regex(rcp.getRegexValue("copyright","REG_SPDX_COPYRIGHT"),
35  rx::regex_constants::icase);
36 
// Patterns below are the ones applied by Cleanup() to a raw candidate.
37  // Cleanup
38  regExceptionCopy = rx::regex(rcp.getRegexValue("copyright","REG_EXCEPTION_COPY"),
39  rx::regex_constants::icase);
40  regRemoveFileStmt = rx::regex(rcp.getRegexValue("copyright","REG_REMOVE_FILE_STATEMENT"),
41  rx::regex_constants::icase);
42  regStripLicenseTrail = rx::regex(rcp.getRegexValue("copyright", "REG_STRIP_LICENSE_TRAIL"),
43  rx::regex_constants::icase);
44  regStripTrademarkTrail = rx::regex(rcp.getRegexValue("copyright", "REG_STRIP_TRADEMARK_TRAIL"),
45  rx::regex_constants::icase);
// Note: the member is named regStripAllRightReserveTrail but the key has no "STRIP_" part.
46  regStripAllRightReserveTrail = rx::regex(rcp.getRegexValue("copyright", "REG_ALL_RIGHT_RESERVE_TRAIL"),
47  rx::regex_constants::icase);
48 
49 }
50 
59 void hCopyrightScanner::ScanString(const string& s, list<match>& out) const
60 {
61 
62  string::const_iterator begin = s.begin();
63  string::const_iterator pos = begin;
64  string::const_iterator end = s.end();
65  while (pos != end)
66  {
67  // Find potential copyright statement
68  rx::smatch results;
69  if (!rx::regex_search(pos, end, results, regCopyright))
70  // No further copyright statement found
71  break;
72  string::const_iterator foundPos = results[0].first;
73 
74  if (!rx::regex_match(foundPos, end, regException))
75  {
86  string::const_iterator j = find(foundPos, end, '\n');
87  while (j != end)
88  {
89  string::const_iterator beginOfLine = j;
90  ++beginOfLine;
91  string::const_iterator endOfLine = find(beginOfLine, end, '\n');
92  if (rx::regex_search(beginOfLine, endOfLine, regSpdxCopyright)){
93  // Found end
94  break;
95  }
96  if (rx::regex_search(beginOfLine, endOfLine, regSimpleCopyright)
97  || !rx::regex_match(beginOfLine, endOfLine, regNonBlank))
98  {
99  // Found end
100  break;
101  }
102  j = endOfLine;
103  }
104  string raw = string(foundPos, j);
105  string cleaned = Cleanup(raw);
106 
107  if (cleaned.empty()) {
108  pos = j;
109  continue;
110  }
111 
112  if (cleaned.size() > 300)
113  cleaned = cleaned.substr(0, 300);
114 
115  out.push_back(match(foundPos - begin, (foundPos - begin) + cleaned.size(), copyrightType));
116  pos = j;
117  }
118  else
119  {
120  // An exception: this is not a copyright statement: continue at the end of this statement
121  pos = results[0].second;
122  }
123  }
124 }
125 
126 string hCopyrightScanner::Cleanup(const string &raw) const {
127  if (rx::regex_search(raw, regExceptionCopy)) {
128  return "";
129  }
130  if (rx::regex_match(raw, regRemoveFileStmt)) {
131  return "";
132  }
133  string cleaned = raw;
134  cleaned = rx::regex_replace(cleaned, regStripLicenseTrail, "");
135  cleaned = rx::regex_replace(cleaned, regStripTrademarkTrail, "");
136  cleaned = rx::regex_replace(cleaned, regStripAllRightReserveTrail, "");
137 
138  RemoveNoisePatterns(cleaned);
139  TrimPunctuation(cleaned);
140  NormalizeCopyright(cleaned);
141  StripSuffixes(cleaned);
142 
143  return cleaned;
144 }
145 
146 void hCopyrightScanner::TrimPunctuation(string &text) const{
147  const string trimCharsAll = ",\'\"-:;&@!";
148  const string trimStartOnly = ".>)]\\/";
149  const string trimEndOnly = "<([\\/";
150 
151  size_t start = text.find_first_not_of(trimCharsAll);
152  size_t end = text.find_last_not_of(trimCharsAll);
153 
154  if (start == string::npos) {
155  text.clear();
156  return;
157  }
158 
159  text = text.substr(start, end - start + 1);
160 
161  while (!text.empty() && trimStartOnly.find(text.front()) != string::npos) {
162  text.erase(0, 1);
163  }
164 
165  while (!text.empty() && trimEndOnly.find(text.back()) != string::npos) {
166  text.pop_back();
167  }
168 }
169 
170 void hCopyrightScanner::RemoveNoisePatterns(string& text) const{
171  const vector<string> patterns = {
172  "<p>", "<a href", "date-of-software", "date-of-document",
173  " $ ", " ? ", "</a>", "( )", "()"
174  };
175 
176  for (const auto& word : patterns) {
177  size_t pos;
178  while ((pos = text.find(word)) != string::npos) {
179  text.replace(pos, word.length(), " ");
180  }
181  }
182 }
183 
184 void hCopyrightScanner::NormalizeCopyright(string& text) const {
185  const vector<pair<string, string>> replacements = {
186  {"SPDX-FileCopyrightText", "Copyright"},
187  {"AssemblyCopyright", "Copyright"},
188  {"AppCopyright", "Copyright"},
189  {"JCOPYRIGHT", "Copyright"},
190  {"COPYRIGHT Copyright", "Copyright"},
191  {"Copyright Copyright", "Copyright"},
192  {"Copyright copyright", "Copyright"},
193  {"copyright copyright", "Copyright"},
194  {"copyright Copyright", "Copyright"},
195  {"copyright\"Copyright", "Copyright"},
196  {"copyright\" Copyright", "Copyright"}
197  };
198 
199  for (const auto& pair : replacements) {
200  const string& from = pair.first;
201  const string& to = pair.second;
202 
203  size_t pos;
204  while ((pos = text.find(from)) != string::npos) {
205  text.replace(pos, from.length(), to);
206  }
207  }
208 }
209 
210 void hCopyrightScanner::StripSuffixes(string& text) const{
211  const vector<string> suffixes = {
212  "copyright", ",", "year", "parts", "0", "1", "author", "all", "some", "and"
213  };
214 
215  for (const auto& suffix : suffixes) {
216  if (text.length() > suffix.length() + 1 &&
217  text.size() >= suffix.size() &&
218  text.compare(text.size() - suffix.size(), suffix.size(), suffix) == 0)
219  {
220  text.erase(text.size() - suffix.size());
221  break;
222  }
223  }
224 }
225 
226 
227 
Provide regex using conf file.
void maybeLoad(const std::string &identity)
Check whether the identity is already loaded in the RegexMap; if not, load it.
const char * getRegexValue(const std::string &name, const std::string &key)
Get the regex as string from the RegexMap.
rx::regex regNonBlank
Definition: copyscan.hpp:43
void ScanString(const string &s, list< match > &results) const
Scan a given string for copyright statements.
Definition: copyscan.cc:59
rx::regex regSpdxCopyright
Definition: copyscan.hpp:44
hCopyrightScanner()
Constructor for default hCopyrightScanner.
Definition: copyscan.cc:20
rx::regex regCopyright
Definition: copyscan.hpp:43
rx::regex regSimpleCopyright
Definition: copyscan.hpp:43
rx::regex regException
Definition: copyscan.hpp:43
int s
The socket that the CLI will use to communicate.
Definition: fo_cli.c:37
start($application)
Start the application. Assumes the application is restartable via /etc/init.d/<script>...
Definition: pkgConfig.php:1214
list_t type structure used to keep various lists. (e.g. there are multiple lists).
Definition: nomos.h:308
Store the results of a regex match.
Definition: scanners.hpp:28