# FOSSology 4.5.1
# Open Source License Compliance by Open Source Software
# Downloader.py
1 #!/usr/bin/env python3
2 
3 # SPDX-FileContributor: © Rajul Jha <rajuljha49@gmail.com>
4 # SPDX-FileContributor: Gaurav Mishra <mishra.gaurav@siemens.com>
5 # SPDX-FileCopyrightText: © 2025 Siemens AG
6 
7 # SPDX-License-Identifier: GPL-2.0-only
8 
9 import concurrent.futures
10 import logging
11 import os
12 import tarfile
13 import threading
14 import urllib.parse
15 import zipfile
16 
17 import requests
18 
19 from .Parsers import DOWNLOAD_URL_KEY, Parser
20 
21 
class Downloader:
  """
  Download dependencies in parallel from their download URLs and extract
  each archive into its component's download directory.
  """

  def __init__(self):
    # Parser whose parsed_components get updated with each package's
    # extracted base directory. Set by download_concurrently().
    self.parser = None
    # Guards concurrent writes to parser.parsed_components from workers.
    self.lock = threading.Lock()
    self.download_timeout = 30  # Seconds for network requests

  def __get_archive_base_dir(self, archive_path: str) -> str:
    """
    Determine the base directory within an extracted archive.

    Assumes a common pattern where archives may contain a single top-level
    directory or none.

    :param archive_path: Path to a zip or tar archive on disk.
    :return: Name of the single common root directory, or "" when the
        archive extracts directly / has multiple roots / cannot be read.
    """
    base_dir = ""
    try:
      members = []
      if zipfile.is_zipfile(archive_path):
        with zipfile.ZipFile(archive_path, 'r') as zip_ref:
          members = zip_ref.namelist()
      elif tarfile.is_tarfile(archive_path):
        with tarfile.open(archive_path, 'r:*') as tar_ref:
          members = tar_ref.getnames()
      if members:
        # Archive member names always use forward slashes (zip app-note;
        # tar members are POSIX paths), so split on '/' rather than
        # os.sep, which would be '\\' on Windows.
        first_part = members[0].split('/')[0]
        if all(
            m == first_part or m.startswith(first_part + '/')
            for m in members
        ):
          base_dir = first_part
        # else: archive extracts directly into
        # package_folder or has multiple roots
    except (
        zipfile.BadZipFile, tarfile.ReadError, tarfile.FilterError, IOError
    ) as e:
      logging.warning(
        f"Could not inspect archive {archive_path} for base directory: {e}"
      )
    return base_dir

  def __download_package(self, component: dict) -> None:
    """
    Download one component's archive, extract it into its download
    directory, and record the extracted base directory on the parser.

    :param component: Parsed component dict; reads DOWNLOAD_URL_KEY,
        'name', 'download_dir' and 'purl'. Errors are logged, not raised.
    """
    download_url = component.get(DOWNLOAD_URL_KEY)
    if not download_url:
      logging.warning(
        "No download URL found for component: "
        f"{component.get('name', 'N/A')}. Skipping."
      )
      return

    package_name = component.get('name', 'unknown_package')
    package_folder = component.get('download_dir')

    if not package_folder:
      logging.error(
        f"Download directory not specified for {package_name}. Skipping."
      )
      return

    os.makedirs(package_folder, exist_ok=True)

    parsed_url = urllib.parse.urlparse(download_url)
    filename = os.path.basename(parsed_url.path)
    if not filename:  # Fallback if basename is empty (e.g., URL ends with /)
      filename = f"{package_name}_download"

    # Try to determine a common archive extension or use a generic one.
    # Checked before splitext so multi-part suffixes like '.tar.gz' are
    # kept whole.
    archive_extensions = [
      '.tar.gz', '.tgz', '.tar.bz2', '.tbz', '.tar.xz', '.txz', '.zip',
      '.whl', '.tar'
    ]
    file_extension = ''
    for ext in archive_extensions:
      if filename.lower().endswith(ext):
        file_extension = ext
        break
    if not file_extension:
      # Fallback to simple splitext if no known archive extension is found
      _, file_extension = os.path.splitext(filename)
      if not file_extension:  # Ensure there's at least some extension
        file_extension = '.bin'  # Unable to determine. Ignore

    file_path = os.path.join(package_folder, f"{package_name}{file_extension}")

    temp_archive_path = file_path  # Use this path for download and extraction

    try:
      logging.info(
        f"Downloading {package_name} from {download_url} to {temp_archive_path}"
      )
      response = requests.get(
        download_url, stream=True, timeout=self.download_timeout
      )
      response.raise_for_status()  # Raise HTTPError for bad responses (4xx
      # or 5xx)

      with open(temp_archive_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
          f.write(chunk)
      logging.info(f"Downloaded {package_name} to {temp_archive_path}")

      if temp_archive_path.lower().endswith('.zip'):
        with zipfile.ZipFile(temp_archive_path, 'r') as zip_ref:
          zip_ref.extractall(package_folder)
        base_dir = self.__get_archive_base_dir(temp_archive_path)
      elif temp_archive_path.lower().endswith(
          ('.tar.gz', '.tgz', '.tar.bz2', '.tbz', '.tar.xz', '.txz', '.tar')
      ):
        with tarfile.open(temp_archive_path, 'r:*') as tar_ref:
          # The downloaded tarball is untrusted input: the 'data' filter
          # (Python 3.12+, whose FilterError is already handled below)
          # blocks path traversal, device nodes and absolute symlinks.
          tar_ref.extractall(package_folder, filter='data')
        base_dir = self.__get_archive_base_dir(temp_archive_path)
      else:
        logging.warning(
          f"Unsupported file format for extraction: {file_extension} for "
          f"{package_name}. File downloaded but not extracted."
        )
        return

      # Update base_dir in parser's components, ensuring thread safety
      purl = component.get('purl')
      if purl and self.parser and purl in self.parser.parsed_components:
        with self.lock:
          self.parser.parsed_components[purl]['base_dir'] = base_dir

      logging.info(
        f"Exported {package_name} to {package_folder} (base_dir: '{base_dir}')"
      )

    except requests.exceptions.Timeout:
      logging.error(
        f"Timeout occurred while downloading {package_name} from {download_url}"
      )
    except requests.exceptions.HTTPError as e:
      logging.error(
        f"HTTP error {e.response.status_code} while downloading "
        f"{package_name} from {download_url}: {e}"
      )
    except requests.exceptions.RequestException as e:
      logging.error(
        f"Network error while downloading {package_name} from {download_url}: "
        f"{e}"
      )
    except (zipfile.BadZipFile, tarfile.ReadError, tarfile.FilterError) as e:
      logging.error(
        f"Error extracting archive for {package_name} from "
        f"{temp_archive_path}: {e}"
      )
    except IOError as e:
      logging.error(
        f"File I/O error during download or extraction for {package_name}: {e}"
      )
    except Exception as e:
      logging.error(
        f"An unexpected error occurred during download or extraction for "
        f"{package_name}: {e}"
      )
    finally:
      # Clean up the downloaded archive file
      if os.path.exists(temp_archive_path):
        try:
          os.remove(temp_archive_path)
        except OSError as e:
          logging.warning(
            f"Could not remove temporary archive file {temp_archive_path}: {e}"
          )

  def download_concurrently(self, parser: Parser):
    """
    Download files concurrently from a list of urls

    :param parser: Parser whose parsed_components supply the download
        URLs; components are updated in place with their base_dir.
    :return: Human-readable summary string of how many packages were
        attempted.
    """
    self.parser = parser

    download_list = [
      component for component in parser.parsed_components.values()
      if component.get(DOWNLOAD_URL_KEY, None)
    ]

    if not download_list:
      logging.info("No packages with download URLs found to download.")
      return "0 packages downloaded."

    logging.info(
      f"Attempting to download {len(download_list)} packages concurrently..."
    )

    with concurrent.futures.ThreadPoolExecutor(
        max_workers=os.cpu_count() or 4
    ) as executor:
      futures = [
        executor.submit(self.__download_package, comp)
        for comp in download_list
      ]

      for future in concurrent.futures.as_completed(futures):
        try:
          future.result()
        except Exception as e:
          logging.error(f"Error downloading package: {e}")

    logging.info(
      f"Finished concurrent download process for {len(download_list)} packages."
    )
    return f"{len(download_list)} packages downloaded."
# Cross-reference index (documentation-generator residue, kept for reference):
# download_concurrently(self, parser: Parser) — Definition: Downloader.py:204
# __download_package(self, component: dict) -> None — Definition: Downloader.py:74
# __get_archive_base_dir(self, archive_path: str) -> str — Definition: Downloader.py:32