FOSSology  4.7.1
Open Source License Compliance by Open Source Software
test_scanners.cc
1 /*
2  SPDX-FileCopyrightText: © 2014-15, 2018 Siemens AG
3 
4  SPDX-License-Identifier: GPL-2.0-only
5 */
6 
#include <cppunit/TestFixture.h>
#include <cppunit/extensions/HelperMacros.h>

#include "regex.hpp"
#include "regscan.hpp"
#include "copyrightUtils.hpp"
#include "cleanEntries.hpp"
#include <cstring>
#include <list>
#include <ostream>
#include <sstream>
#include <string>
#include <vector>
17 
18 using namespace std;
19 
25 ostream& operator<<(ostream& out, const list<match>& l)
26 {
27  for (auto m = l.begin(); m != l.end(); ++m)
28  out << '[' << m->start << ':' << m->end << ':' << m->type << ']';
29  return out;
30 }
31 
/**
 * \brief Shared input text for the scanner tests.
 *
 * The text is one string built from adjacent literals. Pieces that do not end
 * in "\n" run directly into the next piece, so the XML-like author tags and
 * the last two pieces are not separated by any whitespace.
 */
const char testContent[] =
  // Copyright statements
  "© 2007 Hugh Jackman\n\n"
  "Copyright 2004 my company\n\n"
  "Copyrights by any strange people\n\n"
  "(C) copyright 2007-2011, 2013 my favourite company Google\n\n"
  "(C) 2007-2011, 2013 my favourite company Google\n\n"
  // Source code that merely looks like a "(c)" marker
  "if (c) { return -1 } \n\n"
  // Author statements, plain and inside tags
  "Written by: me, myself and Irene.\n\n"
  "Authors all the people at ABC\n\n"
  "<author>Author1</author>"
  "<head>All the people</head>"
  "<author>Author1 Author2 Author3</author>"
  "<author>Author4</author><b>example</b>"
  "Apache\n\n"
  // Says "pants", not "patents"
  "This file is protected under pants 1 , 2 ,3\n\n"
  "Do not modify this document\n\n"
  "the shuttle is a space vehicle designed by NASA\n\n"
  // URL and e-mail addresses
  "visit http://mysite.org/FAQ or write to info@mysite.org\n\n"
  "maintained by benjamin drieu <benj@debian.org>\n\n"
  // Copyright statement that continues on the next comment line
  "* Copyright (c) 1989, 1993\n" // Really just one newline here!
  "* The Regents of the University of California. All rights reserved.\n\n"
  // Keyword phrases
  "to be licensed as a whole"
  "/* Most of the following tests are stolen from RCS 5.7's src/conf.sh. */";
57 
58 class scannerTestSuite : public CPPUNIT_NS :: TestFixture {
59  CPPUNIT_TEST_SUITE (scannerTestSuite);
60  CPPUNIT_TEST (copyscannerTest);
61  CPPUNIT_TEST (copyscannerDotPrefixedNameTest);
62  CPPUNIT_TEST (copyscannerBareKeywordDiscardTest);
63  CPPUNIT_TEST (copyscannerCopyrightedStatementTest);
64  CPPUNIT_TEST (copyscannerBinaryNoiseTest);
65  CPPUNIT_TEST (copyscannerSpdxFullLineTest);
66  CPPUNIT_TEST (copyscannerSpdxArrayTest);
67  CPPUNIT_TEST (copyscannerProseExceptionTest);
68  CPPUNIT_TEST (regAuthorTest);
69  CPPUNIT_TEST (regIpraTest);
70  CPPUNIT_TEST (regEccTest);
71  CPPUNIT_TEST (regUrlTest);
72  CPPUNIT_TEST (regEmailTest);
73  CPPUNIT_TEST (regKeywordTest);
74  CPPUNIT_TEST (cleanEntries);
75 
76  CPPUNIT_TEST_SUITE_END ();
77 
78 private:
86  void scannerTest (const scanner& sc, const char* content, const string& type, list<const char*> expectedStrings)
87  {
88  list<match> matches;
89  list<match> expected;
90  sc.ScanString(content, matches);
91 
92  for (auto s = expectedStrings.begin(); s != expectedStrings.end(); ++s)
93  {
94  const char * p = strstr(content, *s);
95  if (p)
96  {
97  int pos = p - content;
98  expected.push_back(match(pos, pos+strlen(*s), type));
99  }
100  // else: expected string is not contained in original string
101  }
102  CPPUNIT_ASSERT_EQUAL(expected, matches);
103  }
104 
105 protected:
114  {
115  // Test copyright matcher
117 
118  scannerTest(sc, testContent, "statement", { "© 2007 Hugh Jackman",
119  "Copyright 2004 my company",
120  "Copyrights by any strange people",
121  "(C) copyright 2007-2011, 2013 my favourite company Google",
122  "(C) 2007-2011, 2013 my favourite company Google",
123  "Copyright (c) 1989, 1993\n* The Regents of the University of California."
124  });
125  }
126 
132  {
134  // © followed by a dot-prefixed name (.NET) must be fully preserved
135  const char* content1 = "Copyright \xc2\xa9 .NET Foundation and contributors\n";
136  scannerTest(sc, content1, "statement",
137  {"Copyright \xc2\xa9 .NET Foundation and contributors"});
138 
139  // Standalone © (no 'copyright' keyword) with dot-prefixed name
140  const char* content2 = "\xc2\xa9 .NET Foundation and contributors\n";
141  scannerTest(sc, content2, "statement",
142  {"\xc2\xa9 .NET Foundation and contributors"});
143 
144  // Year-qualified form must still work
145  const char* content3 = "Copyright \xc2\xa9 2021 .NET Foundation\n";
146  scannerTest(sc, content3, "statement",
147  {"Copyright \xc2\xa9 2021 .NET Foundation"});
148  }
149 
155  {
157 
158  // All bare-keyword variants must produce NO match
159  const char* bare[] = {
160  "copyright\n",
161  "Copyright\n",
162  "COPYRIGHT\n",
163  "Copyrights\n",
164  nullptr
165  };
166  for (int i = 0; bare[i]; ++i)
167  {
168  list<match> matches;
169  sc.ScanString(bare[i], matches);
170  CPPUNIT_ASSERT_MESSAGE(
171  string("Expected no match for bare keyword: ") + bare[i],
172  matches.empty());
173  }
174 
175  // Strings with real content must still produce exactly one match each
176  const char* valid[] = {
177  "Copyright 2021 .NET Foundation\n",
178  "Copyright (c) 2004 My Company\n",
179  "Copyright \xc2\xa9 .NET Foundation and contributors\n",
180  nullptr
181  };
182  for (int i = 0; valid[i]; ++i)
183  {
184  list<match> matches;
185  sc.ScanString(valid[i], matches);
186  CPPUNIT_ASSERT_MESSAGE(
187  string("Expected one match for valid copyright: ") + valid[i],
188  matches.size() == 1);
189  }
190  }
191 
198  {
200 
201  // All must produce exactly one active match
202  const char* valid[] = {
203  "Copyrighted (C) 1994 Normunds Saumanis (normunds@rx.tech.swh.lv)\n",
204  "Copyrighted (C) 1994, 1995, 1996 Normunds Saumanis (normunds@fi.ibm.com)\n",
205  "copyrighted (C) 1993 by Hartmut Schirmer\n",
206  "copyrighted 1992 by Mark Adler version c10p1, 10 January 1993\n",
207  "copyrighted 1990 Mark Adler\n",
208  "Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com)\n",
209  "Copyright (C) Torres Martinez\n",
210  "Copyright (C) Tobias Schmidt\n",
211  "Copyright (C) Tomas Novak\n",
212  nullptr
213  };
214  for (int i = 0; valid[i]; ++i)
215  {
216  list<match> matches;
217  sc.ScanString(valid[i], matches);
218  // Must have at least one match and every match must be active
219  CPPUNIT_ASSERT_MESSAGE(
220  string("Expected active match for: ") + valid[i],
221  !matches.empty() && matches.front().is_enabled);
222  }
223  }
224 
231  {
233 
234  // Each string is © + short ASCII run + non-ASCII noise.
235  // None must produce an active match.
236  const char* noise[] = {
237  // 3-char ASCII prefix then non-ASCII
238  "\xc2\xa9 sjw\xc2\xa8noise\n",
239  "\xc2\xa9 OMr\xc2\xa5more\n",
240  "\xc2\xa9 tGa\xc3\x89garbage\n",
241  // 4-char ASCII prefix then non-ASCII
242  "\xc2\xa9 ZgU,\xc2\xb5garbage\n",
243  "\xc2\xa9 VJs0\xc3\x93noise\n",
244  // 7-char ASCII prefix then non-ASCII
245  "\xc2\xa9 NuXnHl{\xc2\xa4" "noise\n",
246  // 8-char ASCII prefix then non-ASCII
247  "\xc2\xa9 KtCtdy\x22s\xc3\xa8noise\n",
248  nullptr
249  };
250 
251  for (int i = 0; noise[i]; ++i)
252  {
253  list<match> matches;
254  sc.ScanString(noise[i], matches);
255  bool hasActive = !matches.empty() && matches.front().is_enabled;
256  CPPUNIT_ASSERT_MESSAGE(
257  string("Expected no active match for binary noise string #") + to_string(i),
258  !hasActive);
259  }
260  }
261 
268  {
270 
271  // Normal closed array: 1 deactivated header + 3 active elements
272  {
273  const char content[] =
274  "SPDX-FileCopyrightText = [\n"
275  "\"2026 Fraunhofer-Institut f\xC3\xBCr Produktionstechnik und Automatisierung IPA\",\n"
276  "\"2026 Hilscher Gesellschaft f\xC3\xBCr Systemautomation mbH\",\n"
277  "\"2026 Siemens AG\",\n"
278  "]\n";
279 
280  list<match> matches;
281  sc.ScanString(content, matches);
282 
283  CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected 4 matches total", (size_t)4, matches.size());
284 
285  auto it = matches.begin();
286  CPPUNIT_ASSERT_MESSAGE("Header must be inactive", !it->is_enabled);
287  ++it;
288  for (int i = 1; i <= 3; ++i, ++it)
289  CPPUNIT_ASSERT_MESSAGE(
290  string("Array element ") + to_string(i) + " must be active", it->is_enabled);
291  }
292 
293  // Malformed array (no closing ]): scan must continue past the block and
294  // still detect copyright statements that follow it.
295  {
296  const char content[] =
297  "SPDX-FileCopyrightText = [\n"
298  "\"2026 Company A\",\n"
299  "\"2026 Company B\",\n"
300  // no closing ]
301  "Copyright 2021 Google LLC\n";
302 
303  list<match> matches;
304  sc.ScanString(content, matches);
305 
306  bool foundGoogle = false;
307  for (auto& m : matches) {
308  int len = m.end - m.start;
309  if (len > 0 && strncmp(content + m.start, "Copyright 2021 Google", 21) == 0)
310  foundGoogle = true;
311  }
312  CPPUNIT_ASSERT_MESSAGE(
313  "Copyright after unclosed SPDX array must still be detected", foundGoogle);
314  }
315  }
316 
324  {
326 
327  // Two lines with German umlauts (ü = \xC3\xBC), one plain ASCII line.
328  const char content[] =
329  "// SPDX-FileCopyrightText: 2026 Fraunhofer-Institut f\xC3\xBCr Produktionstechnik und Automatisierung IPA\n"
330  "// SPDX-FileCopyrightText: 2026 Hilscher Gesellschaft f\xC3\xBCr Systemautomation mbH\n"
331  "// SPDX-FileCopyrightText: 2026 Siemens AG\n";
332 
333  list<match> matches;
334  sc.ScanString(content, matches);
335 
336  CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected 3 SPDX matches", (size_t)3, matches.size());
337 
338  // Verify each match is active and its end position reaches the line's '\n'
339  const char* lineStarts[3];
340  lineStarts[0] = strstr(content, "SPDX-FileCopyrightText: 2026 Fraunhofer");
341  lineStarts[1] = strstr(content, "SPDX-FileCopyrightText: 2026 Hilscher");
342  lineStarts[2] = strstr(content, "SPDX-FileCopyrightText: 2026 Siemens");
343 
344  int i = 0;
345  for (auto& m : matches)
346  {
347  CPPUNIT_ASSERT_MESSAGE(
348  string("Match ") + to_string(i) + " must be active",
349  m.is_enabled);
350 
351  int expectedStart = lineStarts[i] - content;
352  int expectedEnd = (int)(strchr(lineStarts[i], '\n') - content);
353 
354  CPPUNIT_ASSERT_EQUAL_MESSAGE(
355  string("Match ") + to_string(i) + " start",
356  expectedStart, m.start);
357  CPPUNIT_ASSERT_EQUAL_MESSAGE(
358  string("Match ") + to_string(i) + " end must reach line end",
359  expectedEnd, m.end);
360  ++i;
361  }
362  }
363 
378  {
380 
381  // All must produce NO active match (either deactivated or discarded)
382  const char* prose[] = {
383  "copyrights appearing in this test file\n",
384  "copyrights appears in the documentation\n",
385  "COPYRIGHT TO DETECT This section uses the standard header\n",
386  "copyright work that can be distributed\n",
387  "copyright protection under the terms of this License\n",
388  "copyrighted interfaces, the original copyright holder\n",
389  "copyright in the work, if the License is applied\n",
390  "copyright of this Package, but belong to whoever generated\n",
391  "copyright on material distributed under this License\n",
392  nullptr
393  };
394  for (int i = 0; prose[i]; ++i)
395  {
396  list<match> matches;
397  sc.ScanString(prose[i], matches);
398  bool hasActive = !matches.empty() && matches.front().is_enabled;
399  CPPUNIT_ASSERT_MESSAGE(
400  string("Expected no active match for prose: ") + prose[i],
401  !hasActive);
402  }
403 
404  // These must remain KEPT — year or (C)+year acts as holder separator
405  const char* valid[] = {
406  "Copyright (C) 2021 Toronto Inc.\n",
407  "Copyright 2021 Workday Inc.\n",
408  "Copyright (C) 2021 Interface Logic Inc.\n",
409  "Copyright (C) 2021 In-N-Out Burgers\n",
410  nullptr
411  };
412  for (int i = 0; valid[i]; ++i)
413  {
414  list<match> matches;
415  sc.ScanString(valid[i], matches);
416  CPPUNIT_ASSERT_MESSAGE(
417  string("Expected active match for: ") + valid[i],
418  !matches.empty() && matches.front().is_enabled);
419  }
420  }
421 
430  {
431  regexScanner sc("author", "copyright");
432  scannerTest(sc, testContent, "author", {
433  "Written by: me, myself and Irene.",
434  "Authors all the people at ABC",
435  "Author1",
436  "Author1 Author2 Author3",
437  "Author4",
438  "maintained by benjamin drieu <benj@debian.org>"
439  });
440  }
441 
449  void regIpraTest () {
450  regexScanner sc("ipra", "ipra");
451  scannerTest(sc, testContent, "ipra", { "US patents 1 , 2 ,3" });
452  }
453 
461  void regEccTest () {
462  regexScanner sc("ecc", "ecc");
463  scannerTest(sc, testContent, "ecc", { "space vehicle designed by NASA" });
464  }
465 
473  void regUrlTest () {
474  regexScanner sc("url", "copyright");
475  scannerTest(sc, testContent, "url", { "http://mysite.org/FAQ" });
476  }
477 
485  void regEmailTest () {
486  regexScanner sc("email", "copyright",1);
487  scannerTest(sc, testContent, "email", { "info@mysite.org", "benj@debian.org" });
488  }
489 
497  void regKeywordTest () {
498  regexScanner sc("keyword", "keyword");
499  scannerTest(sc, testContent, "keyword", {"patent", "licensed as", "stolen from"});
500  }
501 
510  void cleanEntries () {
511  // Binary content
512  string actualFileContent;
513  ReadFileToString("../testdata/testdata142", actualFileContent);
514 
515  vector<string> binaryStrings;
516  std::stringstream *ss = new std::stringstream(actualFileContent);
517  string temp;
518 
519  while (std::getline(*ss, temp)) {
520  binaryStrings.push_back(temp);
521  }
522 
523  // Simulate matches. Each line is a match
524  vector<match> matches;
525  int pos = 0;
526  int size = binaryStrings.size();
527  for (int i = 0; i < size; i++)
528  {
529  int length = binaryStrings[i].length();
530  matches.push_back(
531  match(pos, pos + length, "statement"));
532  pos += length + 1;
533  }
534 
535  // Expected data
536  string expectedFileContent;
537  ReadFileToString("../testdata/testdata142_exp", expectedFileContent);
538 
539  delete(ss);
540  ss = new std::stringstream(expectedFileContent);
541  vector<string> expectedStrings;
542  while (std::getline(*ss, temp)) {
543  expectedStrings.push_back(temp);
544  }
545 
546  vector<string> actualStrings;
547  for (size_t i = 0; i < matches.size(); i ++)
548  {
549  actualStrings.push_back(cleanMatch(actualFileContent, matches[i]));
550  }
551 
552  CPPUNIT_ASSERT(expectedStrings == actualStrings);
553  }
554 };
555 
556 CPPUNIT_TEST_SUITE_REGISTRATION( scannerTestSuite );
Implementation of scanner class for copyright.
Definition: copyscan.hpp:39
void ScanString(const string &s, list< match > &results) const
Scan a given string for copyright statements.
Definition: copyscan.cc:63
Provides a regex scanner using predefined regexs.
Definition: regscan.hpp:21
void regIpraTest()
Test Ipra scanner.
void copyscannerCopyrightedStatementTest()
Regression: "copyrighted" statements and names like "Tom" must not be falsely deactivated by REG_EXCE...
void copyscannerBinaryNoiseTest()
Regression: binary-file content with a short ASCII prefix before non-ASCII bytes must not be reported...
void regKeywordTest()
Test copyright scanner for keywords.
void copyscannerSpdxFullLineTest()
Regression: SPDX-FileCopyrightText entries must be detected as individual single-line statements and ...
void scannerTest(const scanner &sc, const char *content, const string &type, list< const char * > expectedStrings)
Runs scanner on content and check matches against expectedStrings.
void copyscannerBareKeywordDiscardTest()
Test that bare copyright keywords produce no matches.
void copyscannerDotPrefixedNameTest()
Test copyright scanner with dot-prefixed names like .NET Foundation.
void regUrlTest()
Test copyright scanner for URL.
void regEmailTest()
Test copyright scanner for email.
void copyscannerTest()
Test copyright scanner.
void regEccTest()
Test ECC scanner.
void copyscannerProseExceptionTest()
Regression: license-prose strings that contain "copyright" as a common noun must be deactivated,...
void regAuthorTest()
Test copyright scanner for author.
void cleanEntries()
Test cleanMatch() to remove non-UTF8 text and extra spaces.
void copyscannerSpdxArrayTest()
Regression: SPDX-FileCopyrightText = [...] TOML array format must yield one active match per quoted e...
Abstract class to provide interface to scanners.
Definition: scanners.hpp:59
virtual void ScanString(const string &s, list< match > &results) const =0
Scan the given string and add matches to results.
int s
The socket that the CLI will use to communicate.
Definition: fo_cli.c:37
int valid
If the information stored in buffer is valid.
bool ReadFileToString(const string &fileName, string &out)
Utility: read file to string from scanners.h.
Definition: scanners.cc:21
Store the results of a regex match.
Definition: scanners.hpp:28
std::ostream & operator<<(std::ostream &os, const std::vector< int > &x)
<< operator overload to append a vector to an ostream object
Definition: testUtils.hpp:27