FOSSology  4.5.1
Open Source License Compliance by Open Source Software
FormatResults.py
1 #!/usr/bin/env python3
2 
3 # SPDX-FileContributor: © Rajul Jha <rajuljha49@gmail.com>
4 # SPDX-FileContributor: Gaurav Mishra <mishra.gaurav@siemens.com>
5 # SPDX-FileCopyrightText: © 2025 Siemens AG
6 
7 # SPDX-License-Identifier: GPL-2.0-only
8 
9 import os
10 import re
11 from bisect import bisect_right
12 from typing import Any
13 
14 from .CliOptions import CliOptions
15 
16 
18  """
19  For formatting the results from scanners with line number information
20 
21  :ivar cli_options: CliOptions object
22  """
23  cli_options: CliOptions = None
24 
25  _RE_DIFF_HEADER = re.compile(r'^@@ -([0-9]+),([0-9]+) [+]([0-9]+),([0-9]+) @@')
26  _RE_NON_CONTENT_LINE = re.compile(r'^(---|\+\+\+|[^-+ ])')
27 
28 
def __init__(self, cli_options: CliOptions):
    """
    Keep a reference to the CLI options for use by the other methods.

    :param cli_options: CliOptions object
    """
    # Store under the declared attribute name ``cli_options``.
    self.cli_options = cli_options
31 
def format_diff(self, diff_content: str) -> str:
    """
    Format the diff content in a particular format with corrected line numbers.

    Lines matched by ``_RE_DIFF_HEADER`` reset the running left/right line
    counters and are copied through unchanged, as is every line matched by
    ``_RE_NON_CONTENT_LINE``. Every other line is rewritten as
    ``<marker><left> <right>:<content>``; for a removed or added line the
    side that does not apply is filled with spaces.

    :param diff_content: String to format
    :return: formatted_string
    """
    formatted_diff = []
    left = right = 0
    # NOTE(review): the column widths come from the number of digits in the
    # hunk length fields (groups 2 and 4), not from the line numbers
    # themselves -- confirm this is intended.
    left_num_len = right_num_len = 0
    for line in diff_content.splitlines():
        match = self._RE_DIFF_HEADER.match(line)
        if match:
            left = int(match.group(1))
            left_num_len = len(match.group(2))
            right = int(match.group(3))
            right_num_len = len(match.group(4))
            formatted_diff.append(line)
            continue

        if self._RE_NON_CONTENT_LINE.match(line):
            formatted_diff.append(line)
            continue

        # Remove the leading '+', '-', or ' '
        line_content = line[1:]
        if line.startswith('-'):
            # Removed line: only the left counter applies and advances.
            padding = ' ' * right_num_len
            formatted_diff.append(
                f"-{left:<{left_num_len}} {padding}:{line_content}"
            )
            left += 1
        elif line.startswith('+'):
            # Added line: only the right counter applies and advances.
            padding = ' ' * left_num_len
            formatted_diff.append(
                f"+{padding} {right:<{right_num_len}}:{line_content}"
            )
            right += 1
        else:
            # Context line: present on both sides, so both counters advance.
            formatted_diff.append(
                f" {left:<{left_num_len}} {right:<{right_num_len}}:{line_content}"
            )
            left += 1
            right += 1

    return "\n".join(formatted_diff)
79 
def find_line_numbers(
    self, diff_string: str, word_start_byte: int, word_end_byte: int
) -> list[Any]:
    """
    Find line numbers from formatted diff data

    The word is taken as the slice ``diff_string[word_start_byte:word_end_byte]``
    and every occurrence of "digits, colon, anything on the same line, word"
    contributes the captured digits to the result.

    :param diff_string: Formatted diff string
    :param word_start_byte: Start byte of scanner result
    :param word_end_byte: End byte of scanner result
    :return: List of line_numbers found for a given word; empty when the
             slice selects no text
    """
    word = diff_string[word_start_byte:word_end_byte]
    if not word:
        # An empty word would reduce the pattern to a bare "digits + colon"
        # and report every numbered line in the diff.
        return []
    pattern = re.compile(r'(\d+):.*?' + re.escape(word))
    return pattern.findall(diff_string)
95 
97  self, file_path: str, words: list, key: str
98  ) -> dict[str, set[str]] | None:
99  """
100  Find the line number of each word found for a given file path
101 
102  :param file_path: Path of the file to scan
103  :param words: List of words(ScanResult Objects) to be scanned for
104  :param key: Key to scan: 'contents' for copyright and keyword and 'license'
105  for nomos and ojo
106  :return: found_words_with_line_number : dict Dictionary of scanned results
107  with key as scanned word and value as list of line_numbers where
108  it is found.
109  """
110  found_words_with_line_number: dict[str, set[str]] = {}
111  if (self.cli_optionscli_options.repo or self.cli_optionscli_options.scan_only_deps or
112  self.cli_optionscli_options.scan_dir):
113  try:
114  with open(file_path, 'rb') as file:
115  binary_data = file.read()
116  string_data = binary_data.decode('utf-8', errors='ignore')
117 
118  # line_start_offsets will store the byte offset of the first character of each line.
119  # Example: "Hello\nWorld" -> [0, 6]
120  line_start_offsets = [0]
121  for i, char in enumerate(string_data):
122  if char == '\n':
123  line_start_offsets.append(i + 1)
124 
125  for word_info in words:
126  word_start_byte = word_info['start']
127  word_key_value = word_info[key]
128  line_number = bisect_right(line_start_offsets, word_start_byte)
129  found_words_with_line_number.setdefault(word_key_value,
130  set()).add(str(line_number))
131 
132  return found_words_with_line_number
133  except Exception as e:
134  print(f"An error occurred: {e}")
135  return None
136  else:
137  with open(file_path, 'r') as file:
138  content = file.read()
139  for i in range(0, len(words)):
140  line_numbers = self.find_line_numbersfind_line_numbers(
141  content, words[i]['start'], words[i]['end']
142  )
143  found_words_with_line_number[words[i][f'{key}']] = set(line_numbers)
144  return found_words_with_line_number
145 
def process_files(self, root_dir: str) -> None:
    """
    Format the files according to unified diff format

    Does nothing when any of the ``repo``, ``scan_only_deps`` or ``scan_dir``
    options is set. Otherwise every file below ``root_dir`` is read, passed
    through ``format_diff`` and written back in place.

    :param root_dir: Path of the temp dir root to format the files
    :return: None
    """
    options = self.cli_options
    if options.repo or options.scan_only_deps or options.scan_dir:
        return None
    for root, _dirs, files in os.walk(root_dir):
        for file_name in files:
            file_path = os.path.join(root, file_name)
            with open(file_path, 'r', encoding='UTF-8') as file:
                file_contents = file.read()
            try:
                # Turn literal backslash escapes in the text into the
                # characters they denote.
                # NOTE(review): encode() produces UTF-8 bytes while the
                # 'unicode_escape' codec decodes plain bytes as Latin-1, so
                # non-ASCII characters are likely altered here -- confirm.
                normal_string = file_contents.encode().decode('unicode_escape')
            except UnicodeDecodeError:
                # Fall back to the text as read when it cannot be unescaped.
                normal_string = file_contents
            formatted_diff = self.format_diff(normal_string)
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(formatted_diff)
    return None
list[Any] find_line_numbers(self, str diff_string, int word_start_byte, int word_end_byte)
None process_files(self, str root_dir)
str format_diff(self, str diff_content)
dict[str, set[str]]|None find_word_line_numbers(self, str file_path, list words, str key)
Store the results of a regex match.
Definition: scanners.hpp:28