8 #include "copyscan.hpp"
11 #include "regexConfProvider.hpp"
// Type label handed to match() whenever this scanner pushes a result.
17 const string copyrightType(
"statement");
31 const auto icaseOptimize = rx::regex_constants::icase | rx::regex_constants::optimize;
40 regExceptionCopy = rx::regex(rcp.
getRegexValue(
"copyright",
"REG_EXCEPTION_COPY"), icaseOptimize);
41 regRemoveFileStmt = rx::regex(rcp.
getRegexValue(
"copyright",
"REG_REMOVE_FILE_STATEMENT"), icaseOptimize);
42 regStripLicenseTrail = rx::regex(rcp.
getRegexValue(
"copyright",
"REG_STRIP_LICENSE_TRAIL"), icaseOptimize);
43 regStripTrademarkTrail = rx::regex(rcp.
getRegexValue(
"copyright",
"REG_STRIP_TRADEMARK_TRAIL"), icaseOptimize);
44 regStripAllRightReserveTrail = rx::regex(rcp.
getRegexValue(
"copyright",
"REG_ALL_RIGHT_RESERVE_TRAIL"), icaseOptimize);
45 regExceptionVerbFollow = rx::regex(rcp.
getRegexValue(
"copyright",
"REG_EXCEPTION_VERB_FOLLOW"), icaseOptimize);
46 regExceptionAdjectivePrefix = rx::regex(rcp.
getRegexValue(
"copyright",
"REG_EXCEPTION_ADJECTIVE_PREFIX"), icaseOptimize);
47 regExceptionTemplate = rx::regex(rcp.
getRegexValue(
"copyright",
"REG_EXCEPTION_TEMPLATE"), icaseOptimize);
48 regExceptionPassive = rx::regex(rcp.
getRegexValue(
"copyright",
"REG_EXCEPTION_PASSIVE"), icaseOptimize);
49 regStripCopySymNonYear = rx::regex(rcp.
getRegexValue(
"copyright",
"REG_STRIP_COPYSYM_NONYEAR"), icaseOptimize);
50 regExceptionBinaryNoise = rx::regex(rcp.
getRegexValue(
"copyright",
"REG_EXCEPTION_BINARY_NOISE"), icaseOptimize);
51 regExceptionMeta = rx::regex(rcp.
getRegexValue(
"copyright",
"REG_EXCEPTION_META"), icaseOptimize);
52 regExceptionCharNameRun = rx::regex(rcp.
getRegexValue(
"copyright",
"REG_EXCEPTION_CHARNAME_RUN"), icaseOptimize);
66 string::const_iterator begin =
s.begin();
67 string::const_iterator pos = begin;
68 string::const_iterator end =
s.end();
76 string::const_iterator foundPos = results[0].first;
90 string::const_iterator j = find(foundPos, end,
'\n');
97 string::const_iterator beginOfLine = j;
99 string::const_iterator endOfLine = find(beginOfLine, end,
'\n');
105 || !rx::regex_match(beginOfLine, endOfLine,
regNonBlank))
114 string raw = string(foundPos, j);
118 static const rx::regex reArrayOpen(
"\\[\\s*$");
119 if (rx::regex_search(raw, reArrayOpen)) {
121 int startPos = foundPos - begin;
122 int endPos = j - begin;
123 out.push_back(
match(startPos, endPos, copyrightType,
false));
126 static const rx::regex reQuoted(
"\"([^\"]+)\"");
127 static const rx::regex reClose(
"^[[:space:]]*\\]");
128 string::const_iterator arrPos = j;
129 bool arrayClosedCleanly =
false;
130 while (arrPos != end) {
131 string::const_iterator lineStart = arrPos + 1;
132 if (lineStart >= end)
break;
133 string::const_iterator lineEnd = find(lineStart, end,
'\n');
134 string lineStr(lineStart, lineEnd);
136 if (rx::regex_search(lineStr, reClose)) {
138 arrayClosedCleanly =
true;
143 if (rx::regex_search(lineStr, qm, reQuoted)) {
144 string elemRaw =
"SPDX-FileCopyrightText: " + qm[1].str();
146 int eStart = lineStart - begin;
147 int eEnd = lineEnd - begin;
148 if (eEnd - eStart > 300) eEnd = eStart + 300;
149 if (er.disposition == CleanupResult::Disposition::KEEP)
150 out.push_back(
match(eStart, eEnd, copyrightType));
151 else if (er.disposition == CleanupResult::Disposition::DEACTIVATE)
152 out.push_back(
match(eStart, eEnd, copyrightType,
false));
156 pos = arrayClosedCleanly ? arrPos : j;
163 if (result.disposition == CleanupResult::Disposition::DISCARD) {
169 int startPos = foundPos - begin;
170 int endPos = j - begin;
172 if (result.disposition == CleanupResult::Disposition::DEACTIVATE) {
173 if (endPos - startPos > 300)
174 endPos = startPos + 300;
175 out.push_back(
match(startPos, endPos, copyrightType,
false));
180 string& cleaned = result.content;
181 if (cleaned.size() > 300)
182 cleaned = cleaned.substr(0, 300);
185 endPos = startPos + (int)cleaned.size();
186 if (endPos - startPos > 300)
187 endPos = startPos + 300;
189 out.push_back(
match(startPos, endPos, copyrightType));
195 pos = results[0].second;
/**
 * Clean a raw copyright match and classify the outcome.
 *
 * Steps visible in this listing:
 *  - a statement matching regRemoveFileStmt in full yields DEACTIVATE;
 *  - matches of regStripLicenseTrail, regStripTrademarkTrail,
 *    regStripAllRightReserveTrail and regStripCopySymNonYear are replaced
 *    by an empty string;
 *  - a hit on regExceptionTemplate, regExceptionBinaryNoise,
 *    regExceptionMeta or regExceptionCharNameRun yields DEACTIVATE;
 *  - the text is passed through RemoveNoisePatterns, TrimPunctuation,
 *    NormalizeCopyright and StripSuffixes;
 *  - an empty result, or a bare "copyright"/"copyrights"/"copyrighted"
 *    (compared in lower case), yields DISCARD;
 *  - anything else is returned as KEEP together with the cleaned text.
 *
 * NOTE(review): several lines of this function are not visible in this
 * listing; in particular the condition guarding the second exception group
 * (regExceptionCopy etc.) could not be confirmed.
 *
 * @param raw Raw matched text
 * @return Cleaned content (empty unless KEEP) and its disposition
 */
200 CleanupResult hCopyrightScanner::Cleanup(
const string &raw)
const {
// regex_match requires the whole of raw to match, not just a part of it.
203 if (rx::regex_match(raw, regRemoveFileStmt)) {
204 return {
"", Disposition::DEACTIVATE};
// Work on a copy so that raw stays untouched.
207 string cleaned = raw;
// Each strip regex is only applied when it actually matches.
210 if (rx::regex_search(cleaned, regStripLicenseTrail))
211 cleaned = rx::regex_replace(cleaned, regStripLicenseTrail,
string());
212 if (rx::regex_search(cleaned, regStripTrademarkTrail))
213 cleaned = rx::regex_replace(cleaned, regStripTrademarkTrail,
string());
214 if (rx::regex_search(cleaned, regStripAllRightReserveTrail))
215 cleaned = rx::regex_replace(cleaned, regStripAllRightReserveTrail,
string());
216 if (rx::regex_search(cleaned, regStripCopySymNonYear))
217 cleaned = rx::regex_replace(cleaned, regStripCopySymNonYear,
string());
// First exception group: any hit deactivates the statement.
220 if (rx::regex_search(cleaned, regExceptionTemplate) ||
221 rx::regex_search(cleaned, regExceptionBinaryNoise) ||
222 rx::regex_search(cleaned, regExceptionMeta) ||
223 rx::regex_search(cleaned, regExceptionCharNameRun)) {
224 return {
"", Disposition::DEACTIVATE};
// Case-insensitive search for the word "copyright" inside cleaned.
230 static const string kCopyright(
"copyright");
231 auto ciEq = [](
unsigned char a,
unsigned char b){
232 return ::tolower(a) == ::tolower(b);
234 auto dBegin = cleaned.cbegin();
235 auto dEnd = cleaned.cend();
// it1: first occurrence of "copyright" (dEnd if there is none).
236 auto it1 = std::search(dBegin, dEnd, kCopyright.cbegin(), kCopyright.cend(), ciEq);
// it2: next occurrence, searched from 9 characters (the length of
// "copyright") past it1.
// NOTE(review): it1 + 9 is only valid when it1 != dEnd; the guard is
// presumably on a line not visible in this listing - confirm.
238 auto it2 = std::search(it1 + 9, dEnd, kCopyright.cbegin(), kCopyright.cend(), ciEq);
// Second exception group: any hit deactivates the statement.
242 if (rx::regex_search(dBegin, dEnd, regExceptionCopy) ||
243 rx::regex_search(dBegin, dEnd, regExceptionVerbFollow) ||
244 rx::regex_search(dBegin, dEnd, regExceptionAdjectivePrefix) ||
245 rx::regex_search(dBegin, dEnd, regExceptionPassive)) {
246 return {
"", Disposition::DEACTIVATE};
// In-place normalisation helpers, applied in this fixed order.
250 RemoveNoisePatterns(cleaned);
251 TrimPunctuation(cleaned);
252 NormalizeCopyright(cleaned);
253 StripSuffixes(cleaned);
// Nothing left after cleaning.
255 if (cleaned.empty()) {
256 return {
"", Disposition::DISCARD};
// Lengths 9..11 cover "copyright", "copyrights" and "copyrighted"; a
// statement consisting of just one of those words is discarded.
260 if (cleaned.size() >= 9 && cleaned.size() <= 11) {
261 string lower = cleaned;
262 transform(lower.begin(), lower.end(), lower.begin(), ::tolower);
263 if (lower ==
"copyright" || lower ==
"copyrights" || lower ==
"copyrighted") {
264 return {
"", Disposition::DISCARD};
268 return {cleaned, Disposition::KEEP};
/**
 * Trim punctuation and whitespace from the ends of text, in place.
 *
 * Three character sets are used: trimCharsAll is looked up from both ends,
 * trimStartOnly only at the start and trimEndOnly only at the end.
 *
 * NOTE(review): the loop bodies and the handling of the start/end positions
 * are on lines not visible in this listing.
 *
 * @param text Text to trim (modified in place)
 */
271 void hCopyrightScanner::TrimPunctuation(
string &text)
const{
272 static const string trimCharsAll =
" \t,\'\"-:;&@!";
273 static const string trimStartOnly =
".>)]\\/";
274 static const string trimEndOnly =
"<([\\/";
// Positions of the first and last characters that are not in trimCharsAll.
276 size_t start = text.find_first_not_of(trimCharsAll);
277 size_t end = text.find_last_not_of(trimCharsAll);
// npos here means text is empty or holds only trimCharsAll characters.
279 if (
start == string::npos) {
// Walk over leading characters that belong to trimStartOnly.
288 while (leading < text.size() && trimStartOnly.find(text[leading]) != string::npos)
290 if (leading) text.erase(0, leading);
// Loop while the last character belongs to trimEndOnly.
292 while (!text.empty() && trimEndOnly.find(text.back()) != string::npos)
/**
 * Replace every occurrence of a fixed list of noise substrings with a
 * single space, in place.
 *
 * The lookup uses string::find, so it is an exact, case-sensitive match.
 *
 * @param text Text to clean (modified in place)
 */
296 void hCopyrightScanner::RemoveNoisePatterns(
string& text)
const{
// Substrings treated as noise.
297 static const vector<string> patterns = {
298 "<p>",
"<a href",
"date-of-software",
"date-of-document",
299 " $ ",
" ? ",
"</a>",
"( )",
"()"
302 for (
const auto& word : patterns) {
// Search again after each replacement until the pattern no longer occurs.
304 while ((pos = text.find(word)) != string::npos) {
305 text.replace(pos, word.length(),
" ");
/**
 * Rewrite a fixed list of copyright keyword variants and doubled
 * "copyright" words to the single word "Copyright", in place.
 *
 * The lookup uses string::find, so each variant is matched exactly and
 * case-sensitively.
 *
 * @param text Text to normalise (modified in place)
 */
310 void hCopyrightScanner::NormalizeCopyright(
string& text)
const {
// {from, to} pairs; every target is "Copyright".
311 static const vector<pair<string, string>> replacements = {
312 {
"SPDX-FileCopyrightText",
"Copyright"},
313 {
"AssemblyCopyright",
"Copyright"},
314 {
"AppCopyright",
"Copyright"},
315 {
"JCOPYRIGHT",
"Copyright"},
316 {
"COPYRIGHT Copyright",
"Copyright"},
317 {
"Copyright Copyright",
"Copyright"},
318 {
"Copyright copyright",
"Copyright"},
319 {
"copyright copyright",
"Copyright"},
320 {
"copyright Copyright",
"Copyright"},
321 {
"copyright\"Copyright",
"Copyright"},
322 {
"copyright\" Copyright",
"Copyright"}
325 for (
const auto& pair : replacements) {
326 const string& from = pair.first;
327 const string& to = pair.second;
// Search again after each replacement until `from` no longer occurs.
330 while ((pos = text.find(from)) != string::npos) {
331 text.replace(pos, from.length(), to);
/**
 * Remove a trailing suffix from text, in place, when text ends with one of
 * a fixed list of suffix strings.
 *
 * The comparison is exact and case-sensitive.
 *
 * NOTE(review): what happens after the erase (continue with the next
 * suffix, or stop) is on lines not visible in this listing.
 *
 * @param text Text to strip (modified in place)
 */
336 void hCopyrightScanner::StripSuffixes(
string& text)
const{
// Candidate suffixes, tried in this order.
337 static const vector<string> suffixes = {
338 "copyright",
",",
"year",
"parts",
"0",
"1",
"author",
"all",
"some",
"and"
341 for (
const auto& suffix : suffixes) {
// The text must exceed the suffix by at least two characters, and must
// end with the suffix, before anything is removed.
342 if (text.length() > suffix.length() + 1 &&
343 text.size() >= suffix.size() &&
344 text.compare(text.size() - suffix.size(), suffix.size(), suffix) == 0)
// Drop the suffix from the end of the text.
346 text.erase(text.size() - suffix.size());
Provides regexes read from a conf file.
void maybeLoad(const std::string &identity)
Check whether the identity is already loaded in the RegexMap; if not, load it.
const char * getRegexValue(const std::string &name, const std::string &key)
Get the regex as string from the RegexMap.
void ScanString(const string &s, list< match > &results) const
Scan a given string for copyright statements.
rx::regex regSpdxCopyright
hCopyrightScanner()
Constructor for default hCopyrightScanner.
rx::regex regSimpleCopyright
int s
The socket that the CLI will use to communicate.
The main FOSSology C library.
start($application)
Start the application. Assumes the application is restartable via /etc/init.d/<script>...
Outcome of the Cleanup() function.
list_t type structure used to keep various lists (e.g. there are multiple lists).
Store the results of a regex match.