FOSSology  4.5.1
Open Source License Compliance by Open Source Software
SpdxReport.py
1 #!/usr/bin/env python3
2 
3 # SPDX-FileCopyrightText: © 2023,2025 Siemens AG
4 # SPDX-FileContributor: Gaurav Mishra <mishra.gaurav@siemens.com>
5 #
6 # SPDX-License-Identifier: GPL-2.0-only
7 
8 import hashlib
9 import logging
10 import re
11 from datetime import datetime
12 
13 from license_expression import (
14  get_spdx_licensing, LicenseExpression, combine_expressions
15 )
16 from spdx_tools.spdx.model import (
17  Actor, ActorType, Checksum, ChecksumAlgorithm, CreationInfo, Document, File,
18  FileType, Package, PackageVerificationCode, Relationship, RelationshipType,
19  SpdxNoAssertion, ExternalPackageRef, ExternalPackageRefCategory, SpdxNone,
20  ExtractedLicensingInfo
21 )
22 from spdx_tools.spdx.validation.document_validator import \
23  validate_full_spdx_document
24 from spdx_tools.spdx.validation.validation_message import ValidationMessage
25 from spdx_tools.spdx.writer.write_anything import write_file
26 
27 from .CliOptions import CliOptions
28 from .Scanners import Scanners, ScanResultList
29 
30 
class SpdxReport:
    """
    Build, validate and write an SPDX 2.3 report from the scanner results.

    :ivar cli_options: CliOptions object
    :ivar scanner: Scanners object
    :ivar report_files: Dictionary of SPDX files with SPDX ID as key
    :ivar license_package_set: Set of licenses found in package
    :ivar package_verification_set: For every package SPDX ID, the SHA1
        checksums of its files and the names of its excluded files
    :ivar creation_info: Report creation info
    :ivar document: Report document
    :ivar package: Report package (the scanned project itself)
    :ivar dependent_packages: Packages created for dependencies, by SPDX ID
    :ivar extracted_licenses: LicenseRef entries created for licenses which
        are not valid SPDX license identifiers, by license ID
    """

    # The idstring part of "SPDXRef-..." and "LicenseRef-..." may only hold
    # letters, digits, "." and "-" (SPDX 2.3); anything else is replaced.
    _INVALID_ID_CHARS = re.compile(r'[^0-9A-Za-z.\-]')

    def __init__(self, cli_options: CliOptions, scanner: Scanners):
        """
        :param cli_options: CliOptions to use
        :param scanner: Scanners to use
        """
        self.cli_options = cli_options
        self._allowed_licenses_set = set(
            self.cli_options.allowlist.get('licenses', [])
        )
        self._license_cache: dict[str, LicenseExpression] = {}
        self._spdx_lic_cache = get_spdx_licensing()
        self.scanner = scanner
        self.report_files: dict[str, File] = {}
        self.license_package_set: set[str] = set()
        self.package_verification_set: dict[str, dict[str, list[str]]] = {}
        self.creation_info: CreationInfo = CreationInfo(
            spdx_version="SPDX-2.3",
            spdx_id="SPDXRef-DOCUMENT",
            name="FOSSology CI Report",
            data_license="CC0-1.0",
            document_namespace="https://fossology.org",
            creators=[
                Actor(
                    ActorType.ORGANIZATION, "FOSSology",
                    "fossology@fossology.org"
                )
            ],
            created=datetime.now(),
        )
        self.document: Document = Document(self.creation_info)

        parent_package = self.scanner.get_scan_packages().parent_package

        # Name fallbacks: scanned package -> CLI root component -> "NA".
        project_name = (
            (parent_package.get('name') or '').strip()
            or self.cli_options.parser.root_component_name
            or "NA"
        )

        self.package: Package = Package(
            name=project_name,
            spdx_id="SPDXRef-Package",
            files_analyzed=True,
            download_location=parent_package.get('url') or SpdxNoAssertion(),
            release_date=datetime.now(),
        )
        if parent_package.get('description') is not None:
            self.package.description = parent_package['description']

        author = parent_package.get('author')
        if author:
            self.package.originator = Actor(ActorType.ORGANIZATION, author)
        else:
            self.package.originator = SpdxNoAssertion()

        self.document.packages = [self.package]
        self.dependent_packages: dict[str, Package] = {}
        self.extracted_licenses: dict[str, ExtractedLicensingInfo] = {}

    def __get_license_or_ref(self, lic: str) -> LicenseExpression:
        """
        Translate a license name reported by a scanner into an expression.

        Names which are not known SPDX identifiers are turned into a
        ``LicenseRef-fossology-*`` reference and registered in
        ``extracted_licenses``. Results are cached per input name.

        :param lic: License name as reported by the scanner.
        :return: Parsed license expression.
        """
        if lic in self._license_cache:
            return self._license_cache[lic]

        license_spdx = lic
        if self._spdx_lic_cache.validate(lic).invalid_symbols:
            license_spdx = self._INVALID_ID_CHARS.sub(
                '-', f"LicenseRef-fossology-{lic}"
            )
            if license_spdx not in self.extracted_licenses:
                self.extracted_licenses[license_spdx] = ExtractedLicensingInfo(
                    license_id=license_spdx,
                    license_name=lic,
                    extracted_text=f"The license text for {license_spdx} "
                                   "has to be entered."
                )

        lic_expression = self._spdx_lic_cache.parse(license_spdx)
        self._license_cache[lic] = lic_expression
        return lic_expression

    def __add_license_file(self, package: Package, scan_result: ScanResultList):
        """
        Add scan result from license scanner to report.

        The file's concluded license is only set when at least one license
        was found and every found license is on the allow list; otherwise it
        is NOASSERTION. A concluded file license is AND-ed into the concluded
        license of the package.

        :param package: Package to which the file belongs.
        :param scan_result: Scan result from license scanner.
        """
        raw_licenses = [lic['license'] for lic in scan_result.result]
        parsed_set = {self.__get_license_or_ref(name) for name in raw_licenses}
        # Sort so that repeated runs yield the same report.
        parsed_list = sorted(parsed_set, key=str)

        # all() is True for an empty list, so require at least one finding.
        all_allowed = bool(raw_licenses) and all(
            name in self._allowed_licenses_set for name in raw_licenses
        )

        file = self.__get_spdx_file(scan_result, package)
        if all_allowed:
            file.license_concluded = combine_expressions(
                expressions=parsed_list, relation='AND', unique=False
            )
        else:
            file.license_concluded = SpdxNoAssertion()
        file.license_info_in_file = parsed_list

        # Update licenses found in the files of the package
        package.license_info_from_files = sorted(
            set(package.license_info_from_files) | parsed_set, key=str
        )

        if not all_allowed or file.license_concluded is None:
            return

        # Update package.license_concluded
        current = package.license_concluded
        # A package created without license_concluded may hold None
        # (assumed spdx-tools default); treat it like "nothing concluded".
        if current is None or current in (SpdxNoAssertion(), SpdxNone()):
            package.license_concluded = file.license_concluded
        else:
            package.license_concluded = (
                current & file.license_concluded
            ).simplify()

    def __get_spdx_file(
        self, scan_result: ScanResultList, package: Package
    ) -> File:
        """
        Get the SPDX File for given scan result, creating it on first use.

        A new file is linked to its package with a CONTAINS relationship and
        recorded for the package verification code: excluded paths by name,
        all other files by SHA1 checksum.

        :param scan_result: Scan result from scanner.
        :param package: Package to which the file belongs.
        :return: SPDX File for the scan result.
        """
        md5_hash, sha1_hash, sha256_hash = self.__get_file_info(scan_result)
        file_spdx_id = self.__get_file_spdx_id(sha256_hash, package.name)
        if file_spdx_id in self.report_files:
            return self.report_files[file_spdx_id]

        spdx_file = File(
            name=scan_result.file,
            spdx_id=file_spdx_id,
            checksums=[
                Checksum(ChecksumAlgorithm.MD5, md5_hash),
                Checksum(ChecksumAlgorithm.SHA1, sha1_hash),
                Checksum(ChecksumAlgorithm.SHA256, sha256_hash),
            ],
            file_types=[FileType.SOURCE],
            license_concluded=SpdxNoAssertion()
        )
        self.report_files[file_spdx_id] = spdx_file
        self.document.relationships.append(
            Relationship(
                package.spdx_id, RelationshipType.CONTAINS, file_spdx_id
            )
        )

        pkg_verification_data = self.package_verification_set.setdefault(
            package.spdx_id, {'checksums': [], 'excluded_files': []}
        )
        if self.scanner.is_excluded_path(spdx_file.name):
            pkg_verification_data['excluded_files'].append(spdx_file.name)
        else:
            pkg_verification_data['checksums'].append(sha1_hash)

        return spdx_file

    def __add_copyright_file(
        self, package: Package, copyright_result: ScanResultList
    ):
        """
        Add scan result from copyright agent. If the file does not exist,
        creates a new one.

        :param package: Package to which the file belongs.
        :param copyright_result: Scan result from copyright scanner.
        """
        file = self.__get_spdx_file(copyright_result, package)
        file.copyright_text = "\n".join(
            cpy.get('content', '') for cpy in copyright_result.result
        )

    @staticmethod
    def __get_file_info(scan_result: ScanResultList) -> tuple[str, str, str]:
        """
        Get different hash for the file in scan result.

        :param scan_result: Scan result from scanners.
        :return: Tuple of md5, sha1 and sha256 checksums.
        """
        digests = (hashlib.md5(), hashlib.sha1(), hashlib.sha256())
        with open(scan_result.path, "rb") as f:
            # Read in chunks so large files are never fully held in memory.
            while byte_block := f.read(65536):
                for digest in digests:
                    digest.update(byte_block)
        md5_hash, sha1_hash, sha256_hash = (d.hexdigest() for d in digests)
        return md5_hash, sha1_hash, sha256_hash

    @staticmethod
    def __get_file_spdx_id(sha256_hash: str, pkg_name: str) -> str:
        """
        Generate SPDX ID for file in scan result.

        Characters of the package name which are not allowed in an SPDX ID
        are replaced with "-".

        :param sha256_hash: SHA 256 checksum of the file
        :param pkg_name: Package to which the file belongs.
        :return: SPDX ID for the file.
        """
        return SpdxReport._INVALID_ID_CHARS.sub(
            '-', f"SPDXRef-File-{pkg_name}-{sha256_hash}"
        )

    @staticmethod
    def __get_package_spdx_id(component: dict) -> str:
        """
        Generate SPDX ID for a package/component.

        :param component: Package/component to get SPDX ID for.
        :return: SPDX ID for the package.
        """
        pkg_name = component.get('name', '')
        pkg_version = component.get('version', '')
        # The MD5 digest only serves as a short, ID-safe fingerprint of
        # "name_version".
        return "SPDXRef-Package-" + hashlib.md5(
            f"{pkg_name}_{pkg_version}".encode('utf-8', errors='ignore')
        ).hexdigest()

    def write_report(self, file_name: str):
        """
        Validate the document and write the SPDX file.

        :param file_name: Location to store the report.
        :raises RuntimeError: If the document fails SPDX validation.
        """
        validation_messages: list[ValidationMessage] = (
            validate_full_spdx_document(self.document)
        )

        if validation_messages:
            for message in validation_messages:
                logging.warning(
                    "SPDX Validation Warning: %s\nContext: %s",
                    message.validation_message, message.context
                )
            raise RuntimeError(
                "SPDX document validation failed. See logs for details."
            )
        logging.info("SPDX document validated successfully.")

        # validate=False here as we validated above
        write_file(self.document, file_name, validate=False)

    def finalize_document(self):
        """
        Finalize the document by setting relations between packages and files.
        At the same time, add all the licenses from files to the package and
        calculate the verification code, without the excluded files.
        """
        # Packages must exist (with their final SPDX IDs) before files are
        # attached to them; the verification code needs all files.
        self.__create_packages()
        self.__create_license_files()
        self.__create_copyright_files()
        self.__add_files_to_document()
        self.__add_extracted_licenses()
        self.__update_package_verification_code()

    def __create_packages(self) -> None:
        """
        Give the main package its final SPDX ID, let the document describe
        it and add one package per dependency which the main package
        DEPENDS_ON.
        """
        scan_packages = self.scanner.get_scan_packages()
        parent_name = scan_packages.parent_package.get('name', '')
        if parent_name:
            self.package.spdx_id = self._INVALID_ID_CHARS.sub(
                '-', f"SPDXRef-Package-{parent_name}"
            )
        self.document.relationships.append(
            Relationship(
                "SPDXRef-DOCUMENT",
                RelationshipType.DESCRIBES,
                self.package.spdx_id
            )
        )

        for component in scan_packages.dependencies.values():
            # Components sharing name and version map to one SPDX ID; add
            # the package and its relationship to the document only once.
            already_known = (
                self.__get_package_spdx_id(component)
                in self.dependent_packages
            )
            package = self.__get_package_for_component(component)
            if already_known:
                continue
            self.document.packages.append(package)
            self.document.relationships.append(
                Relationship(
                    self.package.spdx_id,
                    RelationshipType.DEPENDS_ON,
                    package.spdx_id
                )
            )

    def __get_package_for_component(self, component: dict) -> Package:
        """
        For a given component, create a package and add it to the list.

        :param component: Component to create package for.
        :return: Create or get existing package.
        """
        pkg_spdx_id = self.__get_package_spdx_id(component)
        if pkg_spdx_id in self.dependent_packages:
            return self.dependent_packages[pkg_spdx_id]

        package = Package(
            spdx_id=pkg_spdx_id,
            name=component.get('name') or 'UNKNOWN',
            version=component.get('version', 'UNKNOWN'),
            download_location=(
                component.get('fossology_download_url') or SpdxNoAssertion()
            ),
            license_info_from_files=[],
            license_concluded=SpdxNone(),
            files_analyzed=True
        )
        self.dependent_packages[pkg_spdx_id] = package

        # (category, reference type, locator); a reference is only added
        # when the component actually provides a locator for it.
        candidate_refs = (
            (
                ExternalPackageRefCategory.PACKAGE_MANAGER, 'purl',
                component.get('purl')
            ),
            (
                ExternalPackageRefCategory.OTHER, 'vcs',
                component.get('vcs_url')
            ),
            (
                ExternalPackageRefCategory.OTHER, 'homepage',
                component.get('homepage_url')
            ),
        )
        for category, reference_type, locator in candidate_refs:
            if locator:
                package.external_references.append(
                    ExternalPackageRef(
                        category=category,
                        reference_type=reference_type,
                        locator=locator
                    )
                )
        return package

    def __create_license_files(self) -> None:
        """
        Add the license scanner results of the main package and of every
        dependency to the report.
        """
        scan_packages = self.scanner.get_scan_packages()
        self.__create_license_file_from_component(
            scan_packages.parent_package, self.package
        )
        for component in scan_packages.dependencies.values():
            self.__create_license_file_from_component(
                component, self.__get_package_for_component(component)
            )

    def __create_copyright_files(self) -> None:
        """
        Add the copyright scanner results of the main package and of every
        dependency to the report.
        """
        scan_packages = self.scanner.get_scan_packages()
        self.__create_copyright_file_from_component(
            scan_packages.parent_package, self.package
        )
        for component in scan_packages.dependencies.values():
            self.__create_copyright_file_from_component(
                component, self.__get_package_for_component(component)
            )

    def __create_license_file_from_component(
        self, component: dict, package: Package
    ) -> None:
        """
        Add every entry of the component's 'SCANNER_RESULTS' to the package.

        :param component: Component holding the scan results.
        :param package: Package to which the files belong.
        """
        for result in component.get('SCANNER_RESULTS', []):
            self.__add_license_file(package, result)

    def __create_copyright_file_from_component(
        self, component: dict, package: Package
    ) -> None:
        """
        Add every entry of the component's 'COPYRIGHT_RESULT' to the package.

        :param component: Component holding the scan results.
        :param package: Package to which the files belong.
        """
        for result in component.get('COPYRIGHT_RESULT', []):
            self.__add_copyright_file(package, result)

    def __add_files_to_document(self) -> None:
        """
        Publish all collected SPDX files on the document.
        """
        self.document.files = list(self.report_files.values())

    def __add_extracted_licenses(self) -> None:
        """
        Publish all collected LicenseRef entries on the document.
        """
        self.document.extracted_licensing_info = list(
            self.extracted_licenses.values()
        )

    def __update_package_verification_code(self) -> None:
        """
        Set the verification code on every package which has recorded files.
        """
        for package in self.document.packages:
            code = self.__calculate_verification_code(package.spdx_id)
            if code is not None:
                package.verification_code = code

    def __calculate_verification_code(
        self, package_spdx_id: str
    ) -> PackageVerificationCode | None:
        """
        Calculate package verification code for the list of checksums and
        return it.

        :param package_spdx_id: Package SPDX ID to calculate the verification
                                code for.
        :return: Package Verification Code based on SPDX specification, or
                 None if no files were recorded for the package.
        """
        pkg_data = self.package_verification_set.get(package_spdx_id)
        if not pkg_data:
            return None

        # sorted() keeps the recorded list untouched; the code is the SHA1
        # of the concatenated, sorted file SHA1 checksums.
        ordered_checksums = sorted(pkg_data.get('checksums', []))
        verification_code = hashlib.sha1(
            "".join(ordered_checksums).encode('utf-8', errors='ignore')
        ).hexdigest()

        return PackageVerificationCode(
            value=verification_code,
            excluded_files=pkg_data.get('excluded_files', [])
        )
None __update_package_verification_code(self)
Definition: SpdxReport.py:430
def __init__(self, CliOptions cli_options, Scanners scanner)
Definition: SpdxReport.py:44
File __get_spdx_file(self, ScanResultList scan_result, Package package)
Definition: SpdxReport.py:172
None __create_copyright_file_from_component(self, dict component, Package package)
Definition: SpdxReport.py:418
def __add_copyright_file(self, Package package, ScanResultList copyright_result)
Definition: SpdxReport.py:217
Package __get_package_for_component(self, dict component)
Definition: SpdxReport.py:342
LicenseExpression __get_license_or_ref(self, str lic)
Definition: SpdxReport.py:110
None __create_license_file_from_component(self, dict component, Package package)
Definition: SpdxReport.py:412
def __add_license_file(self, Package package, ScanResultList scan_result)
Definition: SpdxReport.py:130
tuple[str, str, str] __get_file_info(ScanResultList scan_result)
Definition: SpdxReport.py:232
str __get_package_spdx_id(dict component)
Definition: SpdxReport.py:261
PackageVerificationCode|None __calculate_verification_code(self, str package_spdx_id)
Definition: SpdxReport.py:438
str __get_file_spdx_id(str sha256_hash, str pkg_name)
Definition: SpdxReport.py:250
def write_report(self, str file_name)
Definition: SpdxReport.py:274
list_t type structure used to keep various lists. (e.g. there are multiple lists).
Definition: nomos.h:308