8 #include "copyscan.hpp"
11 #include "regexConfProvider.hpp"
17 const string copyrightType(
"statement");
30 rx::regex_constants::icase);
33 rx::regex_constants::icase);
37 rx::regex_constants::icase);
39 rx::regex_constants::icase);
42 regExceptionCopy = rx::regex(rcp.
getRegexValue(
"copyright",
"REG_EXCEPTION_COPY"),
43 rx::regex_constants::icase);
44 regRemoveFileStmt = rx::regex(rcp.
getRegexValue(
"copyright",
"REG_REMOVE_FILE_STATEMENT"),
45 rx::regex_constants::icase);
46 regStripLicenseTrail = rx::regex(rcp.
getRegexValue(
"copyright",
"REG_STRIP_LICENSE_TRAIL"),
47 rx::regex_constants::icase);
48 regStripTrademarkTrail = rx::regex(rcp.
getRegexValue(
"copyright",
"REG_STRIP_TRADEMARK_TRAIL"),
49 rx::regex_constants::icase);
50 regStripAllRightReserveTrail = rx::regex(rcp.
getRegexValue(
"copyright",
"REG_ALL_RIGHT_RESERVE_TRAIL"),
51 rx::regex_constants::icase);
52 regExceptionVerbFollow = rx::regex(rcp.
getRegexValue(
"copyright",
"REG_EXCEPTION_VERB_FOLLOW"),
53 rx::regex_constants::icase);
54 regExceptionAdjectivePrefix = rx::regex(rcp.
getRegexValue(
"copyright",
"REG_EXCEPTION_ADJECTIVE_PREFIX"),
55 rx::regex_constants::icase);
56 regExceptionTemplate = rx::regex(rcp.
getRegexValue(
"copyright",
"REG_EXCEPTION_TEMPLATE"),
57 rx::regex_constants::icase);
58 regExceptionPassive = rx::regex(rcp.
getRegexValue(
"copyright",
"REG_EXCEPTION_PASSIVE"),
59 rx::regex_constants::icase);
60 regStripCopySymNonYear = rx::regex(rcp.
getRegexValue(
"copyright",
"REG_STRIP_COPYSYM_NONYEAR"),
61 rx::regex_constants::icase);
62 regExceptionBinaryNoise = rx::regex(rcp.
getRegexValue(
"copyright",
"REG_EXCEPTION_BINARY_NOISE"),
63 rx::regex_constants::icase);
64 regExceptionMeta = rx::regex(rcp.
getRegexValue(
"copyright",
"REG_EXCEPTION_META"),
65 rx::regex_constants::icase);
66 regExceptionCharNameRun = rx::regex(rcp.
getRegexValue(
"copyright",
"REG_EXCEPTION_CHARNAME_RUN"),
67 rx::regex_constants::icase);
81 string::const_iterator begin =
s.begin();
82 string::const_iterator pos = begin;
83 string::const_iterator end =
s.end();
91 string::const_iterator foundPos = results[0].first;
105 string::const_iterator j = find(foundPos, end,
'\n');
108 string::const_iterator beginOfLine = j;
110 string::const_iterator endOfLine = find(beginOfLine, end,
'\n');
116 || !rx::regex_match(beginOfLine, endOfLine,
regNonBlank))
123 string raw = string(foundPos, j);
126 if (result.disposition == CleanupResult::Disposition::DISCARD) {
132 if (result.disposition == CleanupResult::Disposition::DEACTIVATE) {
134 out.push_back(
match(foundPos - begin, j - begin, copyrightType,
false));
139 string& cleaned = result.content;
140 if (cleaned.size() > 300)
141 cleaned = cleaned.substr(0, 300);
143 out.push_back(
match(foundPos - begin, (foundPos - begin) + cleaned.size(), copyrightType));
149 pos = results[0].second;
/**
 * \brief Clean a raw copyright candidate and decide what to do with it.
 *
 * The candidate is first checked against the "remove file statement" regex,
 * then stripped of trailing license / trademark / "all rights reserved" text
 * and non-year copyright symbols, then tested against two groups of
 * exception regexes, and finally normalized by the helper passes.
 *
 * \param raw Raw text of the candidate statement
 * \return A CleanupResult whose disposition is DISCARD, DEACTIVATE or KEEP;
 *         the content is empty unless the disposition is KEEP
 */
154 CleanupResult hCopyrightScanner::Cleanup(
const string &raw)
const {
// regex_match requires the whole input to match, so only candidates that
// consist entirely of a file statement are discarded here.
157 if (rx::regex_match(raw, regRemoveFileStmt)) {
158 return {
"", Disposition::DISCARD};
// Work on a copy; every strip regex replaces its matches with the empty string.
161 string cleaned = raw;
162 cleaned = rx::regex_replace(cleaned, regStripLicenseTrail,
"");
163 cleaned = rx::regex_replace(cleaned, regStripTrademarkTrail,
"");
164 cleaned = rx::regex_replace(cleaned, regStripAllRightReserveTrail,
"");
165 cleaned = rx::regex_replace(cleaned, regStripCopySymNonYear,
"");
// First exception group: a hit anywhere in the stripped text discards the
// candidate.
168 if (rx::regex_search(cleaned, regExceptionTemplate) ||
169 rx::regex_search(cleaned, regExceptionBinaryNoise) ||
170 rx::regex_search(cleaned, regExceptionMeta) ||
171 rx::regex_search(cleaned, regExceptionCharNameRun)) {
172 return {
"", Disposition::DISCARD};
// Second exception group: a hit yields DEACTIVATE instead of DISCARD.
// It is evaluated only after the discard group above did not match.
176 if (rx::regex_search(cleaned, regExceptionCopy) ||
177 rx::regex_search(cleaned, regExceptionVerbFollow) ||
178 rx::regex_search(cleaned, regExceptionAdjectivePrefix) ||
179 rx::regex_search(cleaned, regExceptionPassive)) {
180 return {
"", Disposition::DEACTIVATE};
// In-place normalization passes, applied in this fixed order.
183 RemoveNoisePatterns(cleaned);
184 TrimPunctuation(cleaned);
185 NormalizeCopyright(cleaned);
186 StripSuffixes(cleaned);
// Nothing left after normalization: treat as a discard.
188 if (cleaned.empty()) {
189 return {
"", Disposition::DISCARD};
192 return {cleaned, Disposition::KEEP};
/**
 * \brief Trim punctuation from the edges of a statement, in place.
 *
 * Three character sets are used: one consulted for both ends, one only for
 * the start and one only for the end of the text.
 *
 * \param[in,out] text Statement to trim
 */
195 void hCopyrightScanner::TrimPunctuation(
string &text)
const{
// Set consulted for both edges: , ' " - : ; & @ !
196 const string trimCharsAll =
",\'\"-:;&@!";
// Set consulted only for the leading edge: . > ) ] \ /
197 const string trimStartOnly =
".>)]\\/";
// Set consulted only for the trailing edge: < ( [ \ /
198 const string trimEndOnly =
"<([\\/";
// Locate the first and last characters that are NOT in trimCharsAll.
200 size_t start = text.find_first_not_of(trimCharsAll);
201 size_t end = text.find_last_not_of(trimCharsAll);
// start == npos means every character is in trimCharsAll (or text is empty).
// NOTE(review): the handling of this case is not visible here - confirm.
203 if (
start == string::npos) {
// Loops while the first character belongs to trimStartOnly.
// NOTE(review): loop body not visible here; presumably removes that character.
210 while (!text.empty() && trimStartOnly.find(text.front()) != string::npos) {
// Loops while the last character belongs to trimEndOnly.
// NOTE(review): loop body not visible here; presumably removes that character.
214 while (!text.empty() && trimEndOnly.find(text.back()) != string::npos) {
/**
 * \brief Replace known noise fragments in a statement with a single space.
 *
 * Matching is a plain, case-sensitive substring search (string::find), not a
 * regex match.
 *
 * \param[in,out] text Statement to clean in place
 */
219 void hCopyrightScanner::RemoveNoisePatterns(
string& text)
const{
// Literal fragments to blank out: HTML remnants, placeholder tokens and
// empty parentheses.
220 const vector<string> patterns = {
221 "<p>",
"<a href",
"date-of-software",
"date-of-document",
222 " $ ",
" ? ",
"</a>",
"( )",
"()"
225 for (
const auto& word : patterns) {
// Search again from the beginning after each replacement until this
// fragment no longer occurs in the text.
227 while ((pos = text.find(word)) != string::npos) {
228 text.replace(pos, word.length(),
" ");
/**
 * \brief Rewrite copyright keyword variants and doubled keywords to the
 *        single word "Copyright".
 *
 * Matching is a case-sensitive substring search, so each casing variant is
 * listed explicitly in the table.
 *
 * \param[in,out] text Statement to normalize in place
 */
233 void hCopyrightScanner::NormalizeCopyright(
string& text)
const {
// (from, to) pairs, applied in list order. Every "from" is longer than
// "Copyright", so each replacement shortens the text.
234 const vector<pair<string, string>> replacements = {
235 {
"SPDX-FileCopyrightText",
"Copyright"},
236 {
"AssemblyCopyright",
"Copyright"},
237 {
"AppCopyright",
"Copyright"},
238 {
"JCOPYRIGHT",
"Copyright"},
239 {
"COPYRIGHT Copyright",
"Copyright"},
240 {
"Copyright Copyright",
"Copyright"},
241 {
"Copyright copyright",
"Copyright"},
242 {
"copyright copyright",
"Copyright"},
243 {
"copyright Copyright",
"Copyright"},
244 {
"copyright\"Copyright",
"Copyright"},
245 {
"copyright\" Copyright",
"Copyright"}
248 for (
const auto& pair : replacements) {
249 const string& from = pair.first;
250 const string& to = pair.second;
// Re-search from the start after each replacement, so a run of repeated
// keywords is collapsed one step at a time until no occurrence remains.
253 while ((pos = text.find(from)) != string::npos) {
254 text.replace(pos, from.length(), to);
/**
 * \brief Remove known trailing words/characters from the end of a statement.
 *
 * The suffix list is walked once, in order; each suffix is compared
 * case-sensitively against the current end of the text.
 *
 * \param[in,out] text Statement to strip in place
 */
259 void hCopyrightScanner::StripSuffixes(
string& text)
const{
// Trailing tokens to drop when a statement ends with them.
260 const vector<string> suffixes = {
261 "copyright",
",",
"year",
"parts",
"0",
"1",
"author",
"all",
"some",
"and"
264 for (
const auto& suffix : suffixes) {
// Strip only when the text is more than one character longer than the
// suffix, so at least two characters remain afterwards.
// The size() >= suffix.size() test is already implied by that first
// condition.
265 if (text.length() > suffix.length() + 1 &&
266 text.size() >= suffix.size() &&
267 text.compare(text.size() - suffix.size(), suffix.size(), suffix) == 0)
// Cut the matched suffix off the end of the text.
269 text.erase(text.size() - suffix.size());
Provide regex using conf file.
void maybeLoad(const std::string &identity)
Check whether the identity is already loaded in the RegexMap; if not, load it.
const char * getRegexValue(const std::string &name, const std::string &key)
Get the regex as string from the RegexMap.
void ScanString(const string &s, list< match > &results) const
Scan a given string for copyright statements.
rx::regex regSpdxCopyright
hCopyrightScanner()
Constructor for default hCopyrightScanner.
rx::regex regSimpleCopyright
int s
The socket that the CLI will use to communicate.
The main FOSSology C library.
start($application)
Start the application. Assumes the application is restartable via /etc/init.d/<script>...
Outcome of the Cleanup() function.
list_t type structure used to keep various lists (there are multiple lists).
Store the results of a regex match.