Source code for atarashi.libs.initialmatch
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Copyright 2018 Aman Jain (amanjain5221@gmail.com)
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
version 2 as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
__author__ = "Aman Jain"
__email__ = "amanjain5221@gmail.com"
import itertools
def spdx_identifer(data, shortnames):
    '''
    Identify SPDX-License-Identifier

    The input is lower-cased and split into lines. Every line that contains
    ``spdx-license-identifier:`` or ``license:`` is split on whitespace and
    each token is compared, case-insensitively, with the given shortnames.
    Make sure the identifier must be present in Fossology merged license list,
    tokens that are not in ``shortnames`` are ignored.

    :param data: Input File data
    :param shortnames: Array of shortnames (SPDX-ID)
    :return: Array of JSON with scanning results, one entry per matching
             token in order of appearance; ``shortname`` carries the original
             spelling from ``shortnames``
    '''
    # Lower-cased shortname -> original spelling. setdefault keeps the first
    # occurrence when two shortnames differ only by case, and the dict turns
    # the per-token membership test into a single hash lookup.
    original_by_lower = {}
    for shortname in shortnames:
        original_by_lower.setdefault(shortname.lower(), shortname)

    markers = ("spdx-license-identifier:", "license:")
    spdx_identifiers = []
    for line in data.lower().split('\n'):  # preprocessing of data
        if not any(marker in line for marker in markers):
            continue
        # split() without an argument splits on any whitespace run, so an
        # identifier followed by '\r' (CRLF files) or separated by a tab is
        # still recognised.
        for token in line.split():
            if token in original_by_lower:
                spdx_identifiers.append({
                    'shortname': original_by_lower[token],
                    'sim_type': 'SPDXIdentifier',
                    'sim_score': 1.0,
                    'description': ''
                })
    return spdx_identifiers
def initial_match(filePath, processedData, licenses):
    '''
    Collect the inexpensive matches for one file: SPDX identifiers found in
    the raw file, exact header matches, exact full text matches and header
    n-gram similarity matches.

    :param filePath: Input file path; the file is read to look for SPDX
                     identifiers
    :param processedData: Processed input file data in which the processed
                          license headers and texts are searched
    :param licenses: Processed license list providing the ``shortname``,
                     ``processed_header`` and ``processed_text`` columns
                     (accessed through ``.iloc`` and column lookup,
                     presumably a pandas DataFrame - confirm against caller)
    :return: Array of JSON with scanning results from spdx_identifer, the
             exact header matches, the exact full text matches and at most
             five results from HeadersNgramSim, in that order
    '''
    with open(filePath) as file:
        raw_data = file.read()

    # Match SPDX identifiers
    spdx_identifiers = spdx_identifer(raw_data, licenses['shortname'])

    exact_match_header = []
    header_sim_match = []
    exact_match_fulltext = []
    for idx in range(len(licenses)):
        # Fetch the row once instead of re-evaluating licenses.iloc[idx]
        # for every field.
        row = licenses.iloc[idx]
        shortname = row['shortname']

        # match with headers
        header = row['processed_header']
        if len(header) > 0:
            if header in processedData:
                exact_match_header.append({
                    'shortname': shortname,
                    'sim_type': 'ExactHeader',
                    'sim_score': 1.0,
                    'description': ''
                })
            # similarity with headers
            ngram_sim = HeadersNgramSim(header, processedData)
            if ngram_sim >= 0.7:
                header_sim_match.append({
                    'shortname': shortname,
                    'sim_type': 'HeaderNgramSimilarity',
                    'sim_score': ngram_sim,
                    'description': ''
                })

        # match with full text
        full_text = row['processed_text']
        # An empty text is contained in every string, so without this guard a
        # license lacking processed text would "exactly match" every file.
        if len(full_text) > 0 and full_text in processedData:
            exact_match_fulltext.append({
                'shortname': shortname,
                'sim_type': 'ExactFullText',
                'sim_score': 1.0,
                'description': ''
            })

    # Only the first five similarity matches are kept, in license list order;
    # they are not sorted by score.
    matches = list(itertools.chain(spdx_identifiers, exact_match_header,
                                   exact_match_fulltext, header_sim_match[:5]))
    return matches