FOSSology  4.4.0
Open Source License Compliance by Open Source Software
match.c
1 /*
2  Authors: Daniele Fognini, Andreas Wuerl, Marion Deveaud
3  SPDX-FileCopyrightText: © 2013-2015 Siemens AG
4 
5  SPDX-License-Identifier: GPL-2.0-only
6 */
7 
8 #include "match.h"
9 
10 #include "license.h"
11 #include "file_operations.h"
12 
13 static inline void doFindAllMatches(const File* file, const GArray* licenseArray,
14  guint tPos, guint sPos,
15  unsigned maxAllowedDiff, unsigned minAdjacentMatches,
16  GArray* matches) {
17  if (!licenseArray) {
18  /* we hope to get here very often */
19  return;
20  }
21 
22  for (guint i = 0; i < licenseArray->len; i++) {
23  License* license = license_index(licenseArray, i);
24  findDiffMatches(file, license, tPos, sPos, matches, maxAllowedDiff, minAdjacentMatches);
25  }
26 }
27 
28 GArray* findAllMatchesBetween(const File* file, const Licenses* licenses,
29  unsigned maxAllowedDiff, unsigned minAdjacentMatches, unsigned maxLeadingDiff) {
30  GArray* matches = g_array_new(FALSE, FALSE, sizeof(Match*));
31 
32  const GArray* textTokens = file->tokens;
33  const guint textLength = textTokens->len;
34 
35  for (guint tPos = 0; tPos < textLength; tPos++) {
36  for (guint sPos = 0; sPos <= maxLeadingDiff; sPos++) {
37  const GArray* availableLicenses = getLicenseArrayFor(licenses, sPos, textTokens, tPos);
38  doFindAllMatches(file, availableLicenses, tPos, sPos, maxAllowedDiff, minAdjacentMatches, matches);
39  }
40 
41  /* now search short licenses only fully (i.e. maxAllowedDiff = 0, minAdjacentMatches = 1) */
42  const GArray* shortLicenses = getShortLicenseArray(licenses);
43  doFindAllMatches(file, shortLicenses, tPos, 0, 0, 1, matches);
44  }
45 
46  return filterNonOverlappingMatches(matches);
47 }
48 
49 void match_array_free(GArray* matches) {
50 #if GLIB_CHECK_VERSION(2, 32, 0)
51  g_array_set_clear_func(matches, match_destroyNotify);
52 #else
53  for (unsigned int i=0; i< matches->len; ++i) {
54  Match* tmp = g_array_index(matches, Match*, i);
55  match_free(tmp);
56  }
57 #endif
58  g_array_free(matches, TRUE);
59 }
60 
61 int matchFileWithLicenses(MonkState* state, const File* file, const Licenses* licenses, const MatchCallbacks* callbacks) {
62  GArray* matches = findAllMatchesBetween(file, licenses,
63  MAX_ALLOWED_DIFF_LENGTH, MIN_ADJACENT_MATCHES, MAX_LEADING_DIFF);
64  int result = processMatches(state, file, matches, callbacks);
65 
66  // we are done: free memory
67  match_array_free(matches);
68 
69  return result;
70 }
71 
72 static char* getFileName(MonkState* state, long pFileId) {
73  char* pFile = queryPFileForFileId(state->dbManager, pFileId);
74 
75  if (!pFile) {
76  printf("file not found for pFileId=%ld\n", pFileId);
77  return NULL;
78  }
79  char* pFileName;
80 
81 #ifdef MONK_MULTI_THREAD
82 #pragma omp critical(getFileName)
83 #endif
84  {
85  pFileName = fo_RepMkPath("files", pFile);
86  }
87 
88  if (!pFileName) {
89  printf("file '%s' not found\n", pFile);
90  }
91 
92  free(pFile);
93 
94  return pFileName;
95 }
96 
97 int matchPFileWithLicenses(MonkState* state, long pFileId, const Licenses* licenses, const MatchCallbacks* callbacks, char* delimiters) {
98  File file;
99  file.id = pFileId;
100  int result = 0;
101 
102  file.fileName = getFileName(state, pFileId);
103 
104  if (file.fileName != NULL) {
105  result = readTokensFromFile(file.fileName, &(file.tokens), delimiters);
106 
107  if (result) {
108  result = matchFileWithLicenses(state, &file, licenses, callbacks);
109 
110  tokens_free(file.tokens);
111  }
112 
113  free(file.fileName);
114  }
115 
116  return result;
117 }
118 
119 char* formatMatchArray(GArray* matchInfo) {
120  GString* stringBuilder = g_string_new("");
121 
122  size_t len = matchInfo->len;
123  for (size_t i = 0; i < len; i++) {
124  DiffMatchInfo* current = &g_array_index(matchInfo, DiffMatchInfo, i);
125 
126  if (current->text.length > 0) {
127  g_string_append_printf(stringBuilder,
128  "t[%zu+%zu] %s ",
129  current->text.start, current->text.length, current->diffType);
130  }
131  else {
132  g_string_append_printf(stringBuilder,
133  "t[%zu] %s ",
134  current->text.start, current->diffType);
135  }
136 
137  if (current->search.length > 0) {
138  g_string_append_printf(stringBuilder,
139  "s[%zu+%zu]",
140  current->search.start, current->search.length);
141  }
142  else {
143  g_string_append_printf(stringBuilder,
144  "s[%zu]",
145  current->search.start);
146  }
147 
148  if (i < len - 1) {
149  g_string_append_printf(stringBuilder, ", ");
150  }
151  }
152 
153  return g_string_free(stringBuilder, FALSE);
154 }
155 
156 static unsigned short match_rank(Match* match) {
157  if (match->type == MATCH_TYPE_FULL) {
158  return 100;
159  }
160  else {
161  DiffResult* diffResult = match->ptr.diff;
162  const License* license = match->license;
163  unsigned int licenseLength = license->tokens->len;
164  size_t numberOfMatches = diffResult->matched;
165  size_t numberOfAdditions = diffResult->added;
166 
167  // calculate result percentage as jaccard index
168  double rank = (100.0 * numberOfMatches) / (licenseLength + numberOfAdditions);
169  int result = (int) rank;
170 
171  result = MIN(result, 99);
172  result = MAX(result, 1);
173 
174  diffResult->rank = rank;
175  diffResult->percentual = (unsigned short) result;
176 
177  return (unsigned short) result;
178  }
179 }
180 
181 static int match_isFull(const Match* match) {
182  return match->type == MATCH_TYPE_FULL;
183 }
184 
185 size_t match_getStart(const Match* match) {
186  if (match_isFull(match)) {
187  return match->ptr.full->start;
188  }
189  else {
190  GArray* matchedInfo = match->ptr.diff->matchedInfo;
191 
192  DiffPoint firstDiff = g_array_index(matchedInfo, DiffMatchInfo, 0).text;
193 
194  return firstDiff.start;
195  }
196 }
197 
198 size_t match_getEnd(const Match* match) {
199  if (match_isFull(match)) {
200  return match->ptr.full->length + match->ptr.full->start;
201  }
202  else {
203  GArray* matchedInfo = match->ptr.diff->matchedInfo;
204 
205  DiffPoint lastDiff = g_array_index(matchedInfo, DiffMatchInfo, matchedInfo->len - 1).text;
206 
207  return lastDiff.start + lastDiff.length;
208  }
209 }
210 
211 static int match_includes(const Match* big, const Match* small) {
212  return (match_getStart(big) <= match_getStart(small)) && (match_getEnd(big) >= match_getEnd(small));
213 }
214 
215 // make sure to call it after a match_rank or the result will be junk
216 double match_getRank(const Match* match) {
217  if (match_isFull(match)) {
218  return 100.0;
219  }
220  else {
221  return match->ptr.diff->rank;
222  }
223 }
224 
225 static int compareMatchByRank(const Match* matchA, const Match* matchB) {
226  double matchARank = match_getRank(matchA);
227  double matchBRank = match_getRank(matchB);
228 
229  if (matchARank > matchBRank) {
230  return 1;
231  }
232  if (matchARank < matchBRank) {
233  return -1;
234  }
235 
236  return 0;
237 }
238 
239 /* profiling says there is no need to cache the comparison */
240 static int licenseIncludes(const License* big, const License* small) {
241  const GArray* tokensBig = big->tokens;
242  const GArray* tokensSmall = small->tokens;
243 
244  const guint bigLen = tokensBig->len;
245  const guint smallLen = tokensSmall->len;
246 
247  if (smallLen == 0) {
248  return 1;
249  }
250 
251  if (smallLen > bigLen) {
252  return 0;
253  }
254 
255  for (guint i = 0; i < bigLen; i++) {
256  unsigned n = smallLen;
257  if (matchNTokens(tokensBig, i, bigLen, tokensSmall, 0, smallLen, n)) {
258  return 1;
259  }
260  }
261 
262  return 0;
263 }
264 
265 int licensesDiffer(const License *thisLicense, const License *otherLicense) {
266  return (thisLicense->refId != otherLicense->refId);
267 }
268 
269 /* N.B. this is only a partial order of matches
270  *
271  * =0 not comparable
272  * >0 thisMatch >= otherMatch
273  * <0 thisMatch < otherMatch
274  *
275  **/
276 int match_partialComparator(const Match* thisMatch, const Match* otherMatch) {
277  const int thisIncludesOther = match_includes(thisMatch, otherMatch);
278  const int otherIncludesThis = match_includes(otherMatch, thisMatch);
279  const License *thisLicense = thisMatch->license;
280  const License *otherLicense = otherMatch->license;
281 
282  //Verify if matches overlap
283  if (thisIncludesOther || otherIncludesThis) {
284  if (match_isFull(thisMatch) && thisIncludesOther) {
285  return 1;
286  }
287  if (match_isFull(otherMatch) && otherIncludesThis) {
288  return -1;
289  }
290 
291  //Verify if licenses overlap
292  if (licensesDiffer(thisLicense, otherLicense)) {
293  if (licenseIncludes(thisLicense, otherLicense)) {
294  return 1;
295  }
296  if (licenseIncludes(otherLicense, thisLicense)) {
297  return -1;
298  }
299  if (match_isFull(otherMatch) && thisIncludesOther) {
300  //a complete different license is included in this match
301  return 0;
302  }
303  }
304 
305  return (compareMatchByRank(thisMatch, otherMatch) >= 0) ? 1 : -1;
306  }
307  return 0;
308 }
309 
310 /*
311  * finds the maximal matches according to match_partialComparator
312  * destructively filter matches array: input array and discarded matches are automatically freed
313  **/
314 GArray* filterNonOverlappingMatches(GArray* matches) {
315  const guint len = matches->len;
316 
317  /* profiling says this is not time critical and worst case is O(n^2) with any algorithm */
318  /* instead of removing elements from the array set them to NULL and create a new array at the end */
319  for (guint i = 0; i < len; i++) {
320  Match* thisMatch = match_array_index(matches, i);
321  if (thisMatch == NULL) {
322  continue;
323  }
324 
325  for (guint j = i + 1; j < len; j++) {
326  Match* otherMatch = match_array_index(matches, j);
327  if (otherMatch == NULL) {
328  continue;
329  }
330 
331  gint comparison = match_partialComparator(thisMatch, otherMatch);
332 
333  if (comparison > 0) {
334  match_free(otherMatch);
335  match_array_index(matches, j) = NULL;
336  }
337  else if (comparison < 0) {
338  match_free(thisMatch);
339  match_array_index(matches, i) = NULL;
340  break;
341  }
342  }
343  }
344 
345  GArray* result = g_array_new(FALSE, FALSE, sizeof(Match*));
346  for (guint i = 0; i < len; i++) {
347  Match* thisMatch = match_array_index(matches, i);
348  if (thisMatch) {
349  g_array_append_val(result, thisMatch);
350  }
351  }
352 
353  g_array_free(matches, TRUE);
354 
355  return result;
356 }
357 
358 int processMatch(MonkState* state, const File* file, const Match* match, const MatchCallbacks* callbacks) {
359  const License* license = match->license;
360  if (match->type == MATCH_TYPE_DIFF) {
361  DiffResult* diffResult = match->ptr.diff;
362 
363  convertToAbsolutePositions(diffResult->matchedInfo, file->tokens, license->tokens);
364  return callbacks->onDiff(state, file, license, diffResult);
365  }
366  else {
367  DiffMatchInfo matchInfo;
368  matchInfo.text = getFullHighlightFor(file->tokens, match->ptr.full->start, match->ptr.full->length);
369  matchInfo.search = getFullHighlightFor(license->tokens, 0, license->tokens->len);
370  matchInfo.diffType = FULL_MATCH;
371 
372  return callbacks->onFull(state, file, license, &matchInfo);
373  }
374 }
375 
376 int processMatches(MonkState* state, const File* file, const GArray* matches, const MatchCallbacks* callbacks) {
377  if (callbacks->ignore && callbacks->ignore(state, file)) {
378  return 1;
379  }
380 
381  if (callbacks->onAll) {
382  return callbacks->onAll(state, file, matches);
383  }
384 
385  callbacks->onBeginOutput(state);
386 
387  const guint matchCount = matches->len;
388 
389  int result = 1;
390  if (matchCount == 0) {
391  result = callbacks->onNo(state, file);
392  }
393 
394  for (guint matchIndex = 0; result && (matchIndex < matchCount); matchIndex++) {
395  const Match* match = match_array_index(matches, matchIndex);
396  result &= processMatch(state, file, match, callbacks);
397  if (matchIndex != matchCount - 1) {
398  callbacks->onBetweenIndividualOutputs(state);
399  }
400  }
401 
402  callbacks->onEndOutput(state);
403 
404  return result;
405 }
406 
407 Match* diffResult2Match(DiffResult* diffResult, const License* license) {
408  Match* newMatch = malloc(sizeof(Match));
409  newMatch->license = license;
410 
411  /* it's full only if we have no diffs and the license was not truncated */
412  if (diffResult->matchedInfo->len == 1 && (diffResult->matched == license->tokens->len)) {
413  newMatch->type = MATCH_TYPE_FULL;
414  newMatch->ptr.full = malloc(sizeof(DiffPoint));
415  *(newMatch->ptr.full) = g_array_index(diffResult->matchedInfo, DiffMatchInfo, 0).text;
416  diffResult_free(diffResult);
417  }
418  else {
419  newMatch->type = MATCH_TYPE_DIFF;
420  newMatch->ptr.diff = diffResult;
421 
422  }
423  return newMatch;
424 }
425 
426 void findDiffMatches(const File* file, const License* license,
427  size_t textStartPosition, size_t searchStartPosition,
428  GArray* matches,
429  unsigned int maxAllowedDiff, unsigned int minAdjacentMatches) {
430 
431  if (!matchNTokens(file->tokens, textStartPosition, file->tokens->len,
432  license->tokens, searchStartPosition, license->tokens->len,
433  minAdjacentMatches)) {
434  return;
435  }
436 
437  DiffResult* diffResult = findMatchAsDiffs(file->tokens, license->tokens,
438  textStartPosition, searchStartPosition,
439  maxAllowedDiff, minAdjacentMatches);
440 
441  if (diffResult) {
442  Match* newMatch = diffResult2Match(diffResult, license);
443 
444  if (match_rank(newMatch) > MIN_ALLOWED_RANK)
445  g_array_append_val(matches, newMatch);
446  else {
447  match_free(newMatch);
448  }
449  }
450 }
451 
452 #if GLIB_CHECK_VERSION(2, 32, 0)
453 
454 void match_destroyNotify(gpointer matchP) {
455  match_free(*((Match**) matchP));
456 }
457 
458 #endif
459 
460 void match_free(Match* match) {
461  if (match->type == MATCH_TYPE_DIFF) {
462  diffResult_free(match->ptr.diff);
463  }
464  else {
465  free(match->ptr.full);
466  }
467  free(match);
468 }
void matchPFileWithLicenses(CopyrightState const &state, int agentId, unsigned long pFileId, CopyrightDatabaseHandler &databaseHandler)
Get the file contents, scan for statements and save findings to database.
void matchFileWithLicenses(const string &sContent, unsigned long pFileId, CopyrightState const &state, int agentId, CopyrightDatabaseHandler &databaseHandler)
Scan a given file with all available scanners and save findings to database.
char * queryPFileForFileId(fo_dbManager *dbManager, long fileId)
Get the pfile name for a given file ID.
Definition: libfossagent.c:122
char * fo_RepMkPath(const char *Type, char *Filename)
Given a filename, construct the full path to the file.
Definition: libfossrepo.c:352
#define MIN(a, b)
Min of two.
Definition: licenses.c:64
#define MAX(a, b)
Max of two.
Definition: licenses.c:63
Definition: diff.h:14
Definition: monk.h:61
Definition: monk.h:55
Definition: monk.h:67
Definition: match.h:20
Definition: monk.h:44
Definition: nomos.h:426
int len
Length of pattern.
Definition: nomos.h:427
Store the results of a regex match.
Definition: scanners.hpp:28
const int start
Definition: scanners.hpp:35