Source code for atarashi.libs.initialmatch

#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Copyright 2018 Aman Jain (amanjain5221@gmail.com)

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
version 2 as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
"""

__author__ = "Aman Jain"
__email__ = "amanjain5221@gmail.com"

import itertools


[docs]def HeadersNgramSim(header, processedData): ''' Creates array of ngrams Check with the processed data how much are matching sim_score = matches/ count of ngrams :param header: License Header :param processedData: Input file extracted and processed data :return: Array of JSON with scanning results ''' header = header.split(" ") ngrams = [] for i in range(3, 8): ngrams += [header[j:j + i] for j in range(len(header) - i + 1)] count = 0 for ngram in ngrams: if ' '.join(ngram) in processedData: count += 1 sim = 0 if len(ngrams) > 0: sim = float(count) / float(len(ngrams)) return sim
[docs]def spdx_identifer(data, shortnames): ''' Identify SPDX-License-Identifier Make sure the identifier must be present in Fossology merged license list :param data: Input File data :param shortnames: Array of shortnames (SPDX-ID) :return: Array of JSON with scanning results ''' data = data.lower() # preprocessing of data shortnamesLow = [shortname.lower() for shortname in shortnames] tokenized_data = data.split('\n') possible_spdx = [] for idx in range(len(tokenized_data)): if "spdx-license-identifier:" in tokenized_data[idx] or "license:" in tokenized_data[idx]: possible_spdx.append(tokenized_data[idx]) spdx_identifiers = [] for identifiers in possible_spdx: for x in identifiers.split(" "): if x in shortnamesLow: shortnameIndex = shortnamesLow.index(x) spdx_identifiers.append({ 'shortname': shortnames[shortnameIndex], 'sim_type': 'SPDXIdentifier', 'sim_score': 1.0, 'description': '' }) return spdx_identifiers
[docs]def initial_match(filePath, processedData, licenses): ''' :param inputFile: Input file path :param licenseList: Processed License List path :return: Array of JSON with scanning results from spdx_identifer and HeadersNgramSim ''' with open(filePath) as file: raw_data = file.read() # Match SPDX identifiers spdx_identifiers = spdx_identifer(raw_data, licenses['shortname']) # match with headers # similarity with headers exact_match_header = [] header_sim_match = [] for idx in range(len(licenses)): header = licenses.iloc[idx]['processed_header'] if len(header) > 0: if header in processedData: exact_match_header.append({ 'shortname': licenses.iloc[idx]['shortname'], 'sim_type': 'ExactHeader', 'sim_score': 1.0, 'description': '' }) ngram_sim = HeadersNgramSim(header, processedData) if ngram_sim >= 0.7: header_sim_match.append({ 'shortname': licenses.iloc[idx]['shortname'], 'sim_type': 'HeaderNgramSimilarity', 'sim_score': ngram_sim, 'description': '' }) # match with full text exact_match_fulltext = [] for idx in range(len(licenses)): full_text = licenses.iloc[idx]['processed_text'] if full_text in processedData: exact_match_fulltext.append({ 'shortname': licenses.iloc[idx]['shortname'], 'sim_type': 'ExactFullText', 'sim_score': 1.0, 'description': '' }) matches = list(itertools.chain(spdx_identifiers, exact_match_header, exact_match_fulltext, header_sim_match[:5])) return matches