FOSSology  4.5.1
Open Source License Compliance by Open Source Software
Parsers.py
1 #!/usr/bin/env python3
2 
3 # SPDX-FileContributor: © Rajul Jha <rajuljha49@gmail.com>
4 # SPDX-FileContributor: Gaurav Mishra <mishra.gaurav@siemens.com>
5 # SPDX-FileCopyrightText: © 2025 Siemens AG
6 
7 # SPDX-License-Identifier: GPL-2.0-only
8 
9 import json
10 import logging
11 import os
12 from typing import Any
13 
14 import requests
15 from packageurl import PackageURL
16 from packageurl.contrib import purl2url
17 
# Key under which the ecosystem parsers store the resolved download URL on a
# component dict.
DOWNLOAD_URL_KEY = 'fossology_download_url'
# Key under which Parser.classify_components stores the purl type (for example
# 'pypi') on a component dict; None when the type could not be determined.
COMPONENT_TYPE_KEY = 'fossology_component_type'
20 
21 
22 class Parser:
23  """
24  Parser to classify each component based on it's type.
25  Ex: If purl is pkg:pypi/django@1.11.1,
26  it is a pypi package and should belong to python_components.
27  """
28 
29  def __init__(self, sbom_file: str):
30  """
31  Initialize components list and load the sbom_data.
32  Args:
33  sbom_file: str | Path to sbom file
34  """
35  try:
36  with open(sbom_file, 'r', encoding='utf-8') as file:
37  self.sbom_datasbom_data = json.load(file)
38  except FileNotFoundError as e:
39  logging.error(f"SBOM file not found: {sbom_file}")
40  raise e
41  except json.JSONDecodeError as e:
42  logging.error(f"Invalid JSON in SBOM file: {sbom_file}")
43  raise e
44  except Exception as e:
45  logging.error(
46  f"An unexpected error occurred while reading SBOM file {sbom_file}: {e}"
47  )
48  raise e
49 
50  self.root_component_nameroot_component_name = None
51  self.parsed_components: dict[str, dict[str, Any]] = {}
52 
53  def classify_components(self, root_download_dir: str):
54  """
55  Classify components based on it's type
56 
57  :param root_download_dir: Download dir prefix. Will be used to create
58  download dir.
59  """
60  self.root_component_nameroot_component_name = (
61  self.sbom_datasbom_data.get('metadata', {})
62  .get('component', {}).get('name', None))
63  for component in self.sbom_datasbom_data.get('components', []):
64  purl = component.get('purl', '')
65  if not purl:
66  continue
67  comp_type = self._extract_type_extract_type(purl)
68 
69  # Ensure component has 'name' and 'version' for path construction
70  comp_name = component.get('name', 'unknown_name')
71  comp_version = component.get('version', 'unknown_version')
72 
73  component['download_dir'] = os.path.join(
74  root_download_dir, comp_type or 'unclassified', comp_name, comp_version
75  )
76  component[COMPONENT_TYPE_KEY] = comp_type
77  self.parsed_components[purl] = component
78 
79  def _extract_type(self, purl: str) -> str | None:
80  """
81  Extracts the package type from the purl.
82  Example purl: pkg:pypi/django@1.11.1
83  The type here is 'pypi'.
84  Args:
85  purl: str | Purl of the package to scan
86  Return:
87  purl_type: str | Type of component or None
88  """
89  # purl format: pkg:type/namespace/name@version?qualifiers#subpath
90  try:
91  if purl.startswith("pkg:"):
92  parsed_purl = PackageURL.from_string(purl)
93  return parsed_purl.type
94  return None
95  except ValueError as e:
96  logging.warning(f"Could not parse PURL '{purl}': {e}")
97  return None
98  except Exception as e:
99  logging.error(
100  "An unexpected error occurred while extracting PURL type for '"
101  f"{purl}': {e}"
102  )
103  return None
104 
105  @property
106  def python_components(self) -> list[dict[str, Any]]:
107  return [comp for comp in self.parsed_components.values() if
108  comp.get(COMPONENT_TYPE_KEY) == 'pypi']
109 
110  @property
111  def npm_components(self) -> list[dict[str, Any]]:
112  return [comp for comp in self.parsed_components.values() if
113  comp.get(COMPONENT_TYPE_KEY) == 'npm']
114 
115  @property
116  def php_components(self) -> list[dict[str, Any]]:
117  return [comp for comp in self.parsed_components.values() if
118  comp.get(COMPONENT_TYPE_KEY) == 'composer']
119 
120  @property
121  def unsupported_components(self) -> list[dict[str, Any]]:
122  return [comp for comp in self.parsed_components.values() if
123  comp.get(COMPONENT_TYPE_KEY) not in ['pypi', 'npm', 'composer']]
124 
125 
127  """
128  Python Parser to parse the python sboms to generate download urls from
129  cyclonedx format sbom files.
130  """
131 
132  PYPI_BINARY_DIST_WHEEL = 'bdist_wheel'
133  PYPI_SOURCE_DIST = 'sdist'
134 
135  def _generate_api_endpoint(self, package_name: str, version: str) -> str:
136  """
137  Generate JSON REST API Endpoint to fetch download url.
138  Args:
139  package_name: str Name of package
140  version: str Version of package
141  Return:
142  JSON REST API endpoint tp fetch metadata of package
143  """
144  return f"https://pypi.org/pypi/{package_name}/{version}/json"
145 
146  def parse_components(self, parser: Parser) -> None:
147  """
148  Parse SBOM file for package name and download url of package.
149  Return:
150  None
151  """
152  for comp in parser.python_components:
153  purl = comp.get('purl')
154  if not purl:
155  logging.warning(f"Python component missing PURL: {comp}. Skipping.")
156  continue
157 
158  component = parser.parsed_components.get(purl)
159  if not component:
160  logging.warning(
161  f"Component with PURL {purl} not found in parsed_components. "
162  "Skipping."
163  )
164  continue
165 
166  package_name = component.get('name')
167  version = component.get('version')
168  if not package_name or not version:
169  logging.warning(
170  f"Python component {purl} missing name or version. Skipping."
171  )
172  continue
173 
174  api_endpoint = self._generate_api_endpoint_generate_api_endpoint(package_name, version)
175  logging.info(f"API endpoint for {package_name} : {api_endpoint}")
176 
177  try:
178  response = requests.get(
179  api_endpoint, timeout=10
180  )
181  response.raise_for_status() # Raise an exception for HTTP errors (
182  # 4xx or 5xx)
183 
184  data = response.json()
185  sdist_url = None
186  wheel_url = None
187 
188  for url_info in data.get('urls', []):
189  if url_info.get('packagetype') == self.PYPI_SOURCE_DISTPYPI_SOURCE_DIST:
190  sdist_url = url_info.get('url')
191  elif url_info.get('packagetype') == self.PYPI_BINARY_DIST_WHEELPYPI_BINARY_DIST_WHEEL:
192  wheel_url = url_info.get('url')
193 
194  # Prefer sdist, fallback to wheel if sdist is not available
195  download_url = sdist_url if sdist_url else wheel_url
196  if download_url:
197  component[DOWNLOAD_URL_KEY] = download_url
198  else:
199  logging.warning(
200  f"No suitable download URL found for {package_name} {version}"
201  )
202 
203  # Extract VCS and homepage URLs
204  project_urls = data.get('info', {}).get('project_urls', {})
205  for key, value in project_urls.items():
206  if "source" in key.lower():
207  component['vcs_url'] = value
208  if "homepage" in key.lower():
209  component['homepage_url'] = value
210 
211  except requests.exceptions.RequestException as e:
212  logging.error(
213  f"Failed to retrieve data for {package_name} {version} from "
214  f"{api_endpoint}: {e}"
215  )
216  except json.JSONDecodeError:
217  logging.error(
218  f"Failed to decode JSON response from {api_endpoint} for "
219  f"{package_name} {version}."
220  )
221  except Exception as e:
222  logging.error(
223  f"An unexpected error occurred while parsing Python component "
224  f"{purl}: {e}"
225  )
226 
227 
228 class NPMParser:
229  """
230  NPM Parser to parse the python sboms to generate download urls from
231  cyclonedx format sbom files.
232  """
233 
234  def _get_download_url(self, purl: str) -> str:
235  """
236  Get download url from purl for NPM Packages
237  Args:
238  purl: str
239  Return:
240  download_url: str
241  """
242  return purl2url.get_download_url(purl)
243 
244  def parse_components(self, parser: Parser) -> None:
245  """
246  Parse the components to extract the tuple of (<package_name>,
247  <download_url>)
248  Return:
249  None
250  """
251  for comp in parser.npm_components:
252  purl = comp.get('purl')
253  if not purl:
254  logging.warning(f"NPM component missing PURL: {comp}. Skipping.")
255  continue
256 
257  component = parser.parsed_components.get(purl)
258  if not component:
259  logging.warning(
260  f"Component with PURL {purl} not found in parsed_components. "
261  f"Skipping."
262  )
263  continue
264 
265  name = component.get('name', 'unknown_name')
266  try:
267  download_url = self._get_download_url_get_download_url(purl)
268  component[DOWNLOAD_URL_KEY] = download_url
269  except Exception as e:
270  logging.error(
271  f"Invalid Download URL for NPM package: {name} ({purl}) :: {e}"
272  )
None parse_components(self, Parser parser)
Definition: Parsers.py:244
str _get_download_url(self, str purl)
Definition: Parsers.py:234
def __init__(self, str sbom_file)
Definition: Parsers.py:29
str|None _extract_type(self, str purl)
Definition: Parsers.py:79
def classify_components(self, str root_download_dir)
Definition: Parsers.py:53
None parse_components(self, Parser parser)
Definition: Parsers.py:146
str _generate_api_endpoint(self, str package_name, str version)
Definition: Parsers.py:135