11 #include "file_operations.h"
/**
 * Calls findDiffMatches() inside a loop over the elements of licenseArray,
 * passing the text position tPos, the search position sPos and the
 * maxAllowedDiff / minAdjacentMatches limits through unchanged.
 *
 * NOTE(review): this listing omits some original lines (the end of the
 * parameter list, the definition of `license`, the closing braces). The
 * comments describe only the statements that are visible.
 */
13 static inline void doFindAllMatches(
const File* file,
const GArray* licenseArray,
14 guint tPos, guint sPos,
15 unsigned maxAllowedDiff,
unsigned minAdjacentMatches,
22 for (guint i = 0; i < licenseArray->len; i++) {
24 findDiffMatches(file,
license, tPos, sPos, matches, maxAllowedDiff, minAdjacentMatches);
/**
 * Collects matches between the tokens of `file` and the given `licenses`.
 *
 * Visible steps:
 *  - a new GArray holding Match* elements is created;
 *  - for every text position tPos and every search offset sPos from 0 up to
 *    and including maxLeadingDiff, the array returned by getLicenseArrayFor()
 *    is handed to doFindAllMatches();
 *  - the array returned by getShortLicenseArray() is handed to
 *    doFindAllMatches() with sPos = 0, maxAllowedDiff = 0 and
 *    minAdjacentMatches = 1;
 *  - the collected array is passed through filterNonOverlappingMatches() and
 *    that result is returned.
 *
 * NOTE(review): closing braces and possibly other lines are missing from this
 * listing, so the exact nesting of the short-license call is not visible.
 */
28 GArray* findAllMatchesBetween(
const File* file,
const Licenses* licenses,
29 unsigned maxAllowedDiff,
unsigned minAdjacentMatches,
unsigned maxLeadingDiff) {
/* Elements are pointers to Match, not Match values. */
30 GArray* matches = g_array_new(FALSE, FALSE,
sizeof(
Match*));
32 const GArray* textTokens = file->tokens;
33 const guint textLength = textTokens->len;
/* sPos runs from 0 through maxLeadingDiff inclusive (note the <=). */
35 for (guint tPos = 0; tPos < textLength; tPos++) {
36 for (guint sPos = 0; sPos <= maxLeadingDiff; sPos++) {
37 const GArray* availableLicenses = getLicenseArrayFor(licenses, sPos, textTokens, tPos);
38 doFindAllMatches(file, availableLicenses, tPos, sPos, maxAllowedDiff, minAdjacentMatches, matches);
/* Short licenses: search offset 0, no allowed diff, one adjacent match. */
42 const GArray* shortLicenses = getShortLicenseArray(licenses);
43 doFindAllMatches(file, shortLicenses, tPos, 0, 0, 1, matches);
/* The caller receives whatever filterNonOverlappingMatches() returns. */
46 return filterNonOverlappingMatches(matches);
/**
 * Releases a GArray of Match* pointers.
 *
 * When GLIB_CHECK_VERSION(2, 32, 0) holds, match_destroyNotify is registered
 * as the array's clear function. The visible loop reads each Match* element
 * into `tmp`.
 * NOTE(review): the #else / #endif lines and the loop body are not visible in
 * this listing; presumably the loop is the fallback that frees each element by
 * hand. Confirm against the full file.
 */
49 void match_array_free(GArray* matches) {
50 #if GLIB_CHECK_VERSION(2, 32, 0)
51 g_array_set_clear_func(matches, match_destroyNotify);
53 for (
unsigned int i=0; i< matches->len; ++i) {
54 Match* tmp = g_array_index(matches,
Match*, i);
/* TRUE: the element storage is released together with the GArray itself. */
58 g_array_free(matches, TRUE);
62 GArray* matches = findAllMatchesBetween(file, licenses,
63 MAX_ALLOWED_DIFF_LENGTH, MIN_ADJACENT_MATCHES, MAX_LEADING_DIFF);
64 int result = processMatches(state, file, matches, callbacks);
67 match_array_free(matches);
/**
 * Returns a char* for the given pFileId (the call site assigns the result to
 * file.fileName).
 *
 * Only the diagnostics are visible in this listing: one message reporting that
 * no file was found for pFileId, and one reporting that the file named by
 * `pFile` was not found. When MONK_MULTI_THREAD is defined, a statement that
 * is not visible here runs inside the OpenMP critical section named
 * getFileName.
 * NOTE(review): the lookup calls, the conditions guarding the messages and the
 * return statements are missing from this listing.
 */
72 static char* getFileName(
MonkState* state,
long pFileId) {
76 printf(
"file not found for pFileId=%ld\n", pFileId);
81 #ifdef MONK_MULTI_THREAD
82 #pragma omp critical(getFileName)
89 printf(
"file '%s' not found\n", pFile);
102 file.fileName = getFileName(state, pFileId);
104 if (file.fileName != NULL) {
105 result = readTokensFromFile(file.fileName, &(file.tokens), delimiters);
110 tokens_free(file.tokens);
/**
 * Renders the entries of `matchInfo` into one newly built string.
 *
 * For each entry (`current`) the text part and the search part are each
 * appended in one of two formats, chosen by whether the corresponding length
 * is greater than zero. A ", " separator is also appended.
 *
 * @return the character data of the GString. It is detached from the GString
 *         by g_string_free(..., FALSE), so the caller owns it and must free it.
 *
 * NOTE(review): the format strings, the definition of `current`, the else
 * branches and the condition guarding the separator are missing from this
 * listing.
 */
119 char* formatMatchArray(GArray* matchInfo) {
120 GString* stringBuilder = g_string_new(
"");
122 size_t len = matchInfo->len;
123 for (
size_t i = 0; i < len; i++) {
/* Text side: the variant with a length also prints text.length. */
126 if (current->text.length > 0) {
127 g_string_append_printf(stringBuilder,
129 current->text.start, current->text.length, current->diffType);
132 g_string_append_printf(stringBuilder,
134 current->text.start, current->diffType);
/* Search side: same split on search.length. */
137 if (current->search.length > 0) {
138 g_string_append_printf(stringBuilder,
140 current->search.start, current->search.length);
143 g_string_append_printf(stringBuilder,
145 current->search.start);
/* Separator between entries. */
149 g_string_append_printf(stringBuilder,
", ");
/* FALSE keeps the character buffer alive and returns it. */
153 return g_string_free(stringBuilder, FALSE);
/**
 * Computes the rank of a match and stores it in the match's diff result.
 *
 * Visible computation for the non-full case:
 *   rank = 100 * matched / (license token count + added)
 * The truncated integer value is clamped to the range 1..99. The unclamped
 * double is written to diffResult->rank, and the clamped value is written to
 * diffResult->percentual and returned.
 *
 * NOTE(review): the body of the MATCH_TYPE_FULL branch and the definitions of
 * `license` and `diffResult` are missing from this listing. The divisor is not
 * visibly checked for zero.
 */
156 static unsigned short match_rank(
Match*
match) {
157 if (
match->type == MATCH_TYPE_FULL) {
163 unsigned int licenseLength =
license->tokens->
len;
164 size_t numberOfMatches = diffResult->matched;
165 size_t numberOfAdditions = diffResult->added;
/* Percentage of matched tokens relative to license length plus additions. */
168 double rank = (100.0 * numberOfMatches) / (licenseLength + numberOfAdditions);
169 int result = (int) rank;
/* Clamp to 1..99. */
171 result =
MIN(result, 99);
172 result =
MAX(result, 1);
/* The precise value is kept as rank, the clamped one as percentual. */
174 diffResult->rank = rank;
175 diffResult->percentual = (
unsigned short) result;
177 return (
unsigned short) result;
182 return match->type == MATCH_TYPE_FULL;
186 if (match_isFull(
match)) {
190 GArray* matchedInfo =
match->ptr.diff->matchedInfo;
194 return firstDiff.start;
199 if (match_isFull(
match)) {
203 GArray* matchedInfo =
match->ptr.diff->matchedInfo;
207 return lastDiff.start + lastDiff.length;
/**
 * Tells whether the range of `big` covers the range of `small`.
 *
 * @return non-zero when big starts at or before small (match_getStart) and
 *         ends at or after small (match_getEnd); zero otherwise.
 */
211 static int match_includes(
const Match* big,
const Match* small) {
212 return (match_getStart(big) <= match_getStart(small)) && (match_getEnd(big) >= match_getEnd(small));
217 if (match_isFull(
match)) {
221 return match->ptr.diff->rank;
/**
 * Compares two matches by the value match_getRank() returns for each.
 *
 * NOTE(review): the return statements are missing from this listing, so the
 * sign convention is not visible here. match_partialComparator treats a
 * result >= 0 as favouring its first argument.
 */
225 static int compareMatchByRank(
const Match* matchA,
const Match* matchB) {
226 double matchARank = match_getRank(matchA);
227 double matchBRank = match_getRank(matchB);
/* Greater and smaller are tested separately. */
229 if (matchARank > matchBRank) {
232 if (matchARank < matchBRank) {
/**
 * Compares the token arrays of two licenses.
 *
 * Visible logic: the case smallLen > bigLen is handled separately up front.
 * Then, for every start index i in big's tokens, matchNTokens() is called on
 * big from i and on small from 0, requesting n = smallLen tokens.
 *
 * NOTE(review): presumably this reports whether small's token sequence occurs
 * somewhere inside big's. The return statements and the exact semantics of
 * matchNTokens() are not visible in this listing.
 */
240 static int licenseIncludes(
const License* big,
const License* small) {
241 const GArray* tokensBig = big->tokens;
242 const GArray* tokensSmall = small->tokens;
244 const guint bigLen = tokensBig->len;
245 const guint smallLen = tokensSmall->len;
/* Special case: small has more tokens than big. */
251 if (smallLen > bigLen) {
/* Try every start position in big. */
255 for (guint i = 0; i < bigLen; i++) {
256 unsigned n = smallLen;
257 if (matchNTokens(tokensBig, i, bigLen, tokensSmall, 0, smallLen, n)) {
/**
 * Tells whether two licenses are distinct, judged only by their refId fields.
 *
 * @return non-zero when thisLicense->refId differs from otherLicense->refId,
 *         zero when they are equal.
 */
265 int licensesDiffer(
const License *thisLicense,
const License *otherLicense) {
266 return (thisLicense->refId != otherLicense->refId);
/**
 * Decides between two matches. filterNonOverlappingMatches uses the result:
 * a value > 0 makes it drop otherMatch, a value < 0 makes it drop thisMatch.
 *
 * Visible criteria, in source order:
 *  - whether one match's range includes the other's (match_includes);
 *  - whether the including match is a full match (match_isFull);
 *  - when the licenses differ (licensesDiffer), whether one license includes
 *    the other (licenseIncludes);
 *  - finally the rank comparison: 1 if compareMatchByRank(thisMatch,
 *    otherMatch) >= 0, otherwise -1.
 *
 * NOTE(review): the return values inside the branches, the closing braces and
 * therefore the exact nesting of the conditions are missing from this listing.
 */
276 int match_partialComparator(
const Match* thisMatch,
const Match* otherMatch) {
277 const int thisIncludesOther = match_includes(thisMatch, otherMatch);
278 const int otherIncludesThis = match_includes(otherMatch, thisMatch);
279 const License *thisLicense = thisMatch->license;
280 const License *otherLicense = otherMatch->license;
/* Only considered when one range contains the other. */
283 if (thisIncludesOther || otherIncludesThis) {
284 if (match_isFull(thisMatch) && thisIncludesOther) {
287 if (match_isFull(otherMatch) && otherIncludesThis) {
/* Different licenses: check containment at the license-token level. */
292 if (licensesDiffer(thisLicense, otherLicense)) {
293 if (licenseIncludes(thisLicense, otherLicense)) {
296 if (licenseIncludes(otherLicense, thisLicense)) {
299 if (match_isFull(otherMatch) && thisIncludesOther) {
/* Rank decides; equal ranks yield 1. */
305 return (compareMatchByRank(thisMatch, otherMatch) >= 0) ? 1 : -1;
/**
 * Filters `matches` by pairwise comparison and builds a new array from it.
 *
 * Visible steps:
 *  - every pair (i, j > i) of non-NULL entries is compared with
 *    match_partialComparator(); on a positive result the entry at j is freed
 *    and its slot set to NULL, on a negative result the entry at i is freed
 *    and its slot set to NULL;
 *  - a new GArray of Match* is created and entries of `matches` are appended
 *    to it;
 *  - the input array is released with g_array_free(matches, TRUE), so the
 *    caller must not use `matches` afterwards.
 *
 * NOTE(review): the bodies of the NULL checks, any break/continue statements,
 * the condition guarding the append and the return statement are missing from
 * this listing.
 */
314 GArray* filterNonOverlappingMatches(GArray* matches) {
315 const guint len = matches->len;
319 for (guint i = 0; i < len; i++) {
320 Match* thisMatch = match_array_index(matches, i);
321 if (thisMatch == NULL) {
/* Compare only against later entries; earlier pairs were already handled. */
325 for (guint j = i + 1; j < len; j++) {
326 Match* otherMatch = match_array_index(matches, j);
327 if (otherMatch == NULL) {
331 gint comparison = match_partialComparator(thisMatch, otherMatch);
/* Positive: the other match loses. Negative: this match loses. */
333 if (comparison > 0) {
334 match_free(otherMatch);
335 match_array_index(matches, j) = NULL;
337 else if (comparison < 0) {
338 match_free(thisMatch);
339 match_array_index(matches, i) = NULL;
/* Second pass: copy entries into a fresh array. */
345 GArray* result = g_array_new(FALSE, FALSE,
sizeof(
Match*));
346 for (guint i = 0; i < len; i++) {
347 Match* thisMatch = match_array_index(matches, i);
349 g_array_append_val(result, thisMatch);
/* Frees the old array and its pointer storage. */
353 g_array_free(matches, TRUE);
360 if (
match->type == MATCH_TYPE_DIFF) {
363 convertToAbsolutePositions(diffResult->matchedInfo, file->tokens,
license->tokens);
364 return callbacks->onDiff(state, file,
license, diffResult);
368 matchInfo.text = getFullHighlightFor(file->tokens,
match->ptr.full->
start,
match->ptr.full->length);
369 matchInfo.search = getFullHighlightFor(
license->tokens, 0,
license->tokens->
len);
370 matchInfo.diffType = FULL_MATCH;
372 return callbacks->onFull(state, file,
license, &matchInfo);
377 if (callbacks->ignore && callbacks->ignore(state, file)) {
381 if (callbacks->onAll) {
382 return callbacks->onAll(state, file, matches);
385 callbacks->onBeginOutput(state);
387 const guint matchCount = matches->len;
390 if (matchCount == 0) {
391 result = callbacks->onNo(state, file);
394 for (guint matchIndex = 0; result && (matchIndex < matchCount); matchIndex++) {
395 const Match*
match = match_array_index(matches, matchIndex);
396 result &= processMatch(state, file,
match, callbacks);
397 if (matchIndex != matchCount - 1) {
398 callbacks->onBetweenIndividualOutputs(state);
402 callbacks->onEndOutput(state);
412 if (diffResult->matchedInfo->len == 1 && (diffResult->matched ==
license->tokens->
len)) {
413 newMatch->type = MATCH_TYPE_FULL;
414 newMatch->ptr.full = malloc(
sizeof(
DiffPoint));
415 *(newMatch->ptr.full) = g_array_index(diffResult->matchedInfo,
DiffMatchInfo, 0).text;
416 diffResult_free(diffResult);
419 newMatch->type = MATCH_TYPE_DIFF;
420 newMatch->ptr.diff = diffResult;
427 size_t textStartPosition,
size_t searchStartPosition,
429 unsigned int maxAllowedDiff,
unsigned int minAdjacentMatches) {
431 if (!matchNTokens(file->tokens, textStartPosition, file->tokens->len,
433 minAdjacentMatches)) {
438 textStartPosition, searchStartPosition,
439 maxAllowedDiff, minAdjacentMatches);
442 Match* newMatch = diffResult2Match(diffResult,
license);
444 if (match_rank(newMatch) > MIN_ALLOWED_RANK)
445 g_array_append_val(matches, newMatch);
447 match_free(newMatch);
452 #if GLIB_CHECK_VERSION(2, 32, 0)
/**
 * Clear-function adapter registered on the match array by match_array_free().
 *
 * @param matchP treated as a pointer to a Match* slot; the Match* it holds is
 *               passed to match_free().
 */
454 void match_destroyNotify(gpointer matchP) {
455 match_free(*((
Match**) matchP));
461 if (
match->type == MATCH_TYPE_DIFF) {
462 diffResult_free(
match->ptr.diff);
465 free(
match->ptr.full);
void matchPFileWithLicenses(CopyrightState const &state, int agentId, unsigned long pFileId, CopyrightDatabaseHandler &databaseHandler)
Get the file contents, scan for statements and save findings to database.
void matchFileWithLicenses(const string &sContent, unsigned long pFileId, CopyrightState const &state, int agentId, CopyrightDatabaseHandler &databaseHandler)
Scan a given file with all available scanners and save findings to database.
char * queryPFileForFileId(fo_dbManager *dbManager, long fileId)
Get the pfile name for a given file ID.
char * fo_RepMkPath(const char *Type, char *Filename)
Given a filename, construct the full path to the file.
#define MIN(a, b)
Min of two.
#define MAX(a, b)
Max of two.
int len
Length of pattern.
Store the results of a regex match.