11 from bisect
import bisect_right
12 from typing
import Any
14 from .CliOptions
import CliOptions
19 For formatting the results from scanners with line number information
21 :ivar cli_options: CliOptions object
23 cli_options: CliOptions =
None
25 _RE_DIFF_HEADER = re.compile(
r'^@@ -([0-9]+),([0-9]+) [+]([0-9]+),([0-9]+) @@')
26 _RE_NON_CONTENT_LINE = re.compile(
r'^(---|\+\+\+|[^-+ ])')
29 def __init__(self, cli_options: CliOptions):
34 Format the diff content in a particular format with corrected line numbers.
36 :param diff_content: String to format
37 :return: formatted_string
40 diff_lines = diff_content.splitlines()
42 left_num_len = right_num_len = 0
43 for line
in diff_lines:
46 left = int(match.group(1))
47 left_num_len = len(match.group(2))
48 right = int(match.group(3))
49 right_num_len = len(match.group(4))
50 formatted_diff.append(line)
54 formatted_diff.append(line)
58 line_content = line[1:]
59 if line.startswith(
'-'):
60 padding =
' ' * right_num_len
61 formatted_diff.append(
62 f
"-{left:<{left_num_len}} {padding}:{line_content}"
65 elif line.startswith(
'+'):
66 padding =
' ' * left_num_len
67 formatted_diff.append(
68 f
"+{padding} {right:<{right_num_len}}:{line_content}"
72 formatted_diff.append(
73 f
" {left:<{left_num_len}} {right:<{right_num_len}}:{line_content}"
78 return "\n".join(formatted_diff)
81 self, diff_string: str, word_start_byte: int, word_end_byte: int
84 Find line numbers from formatted diff data
86 :param diff_string: Formatted diff string
87 :param word_start_byte: Start byte of scanner result
88 :param word_end_byte: End byte of scanner result
89 :return: List of line_numbers found for a given word
91 escaped_word = re.escape(diff_string[word_start_byte:word_end_byte])
92 pattern = re.compile(
r'(\d+):.*?' + escaped_word)
93 matches = pattern.findall(diff_string)
97 self, file_path: str, words: list, key: str
98 ) -> dict[str, set[str]] |
None:
100 Find the line number of each word found for a given file path
102 :param file_path: Path of the file to scan
103 :param words: List of words(ScanResult Objects) to be scanned for
104 :param key: Key to scan: 'contents' for copyright and keyword and 'license'
106 :return: found_words_with_line_number : dict Dictionary of scanned results
107 with key as scanned word and value as list of line_numbers where
110 found_words_with_line_number: dict[str, set[str]] = {}
114 with open(file_path,
'rb')
as file:
115 binary_data = file.read()
116 string_data = binary_data.decode(
'utf-8', errors=
'ignore')
120 line_start_offsets = [0]
121 for i, char
in enumerate(string_data):
123 line_start_offsets.append(i + 1)
125 for word_info
in words:
126 word_start_byte = word_info[
'start']
127 word_key_value = word_info[key]
128 line_number = bisect_right(line_start_offsets, word_start_byte)
129 found_words_with_line_number.setdefault(word_key_value,
130 set()).add(str(line_number))
132 return found_words_with_line_number
133 except Exception
as e:
134 print(f
"An error occurred: {e}")
137 with open(file_path,
'r')
as file:
138 content = file.read()
139 for i
in range(0, len(words)):
141 content, words[i][
'start'], words[i][
'end']
143 found_words_with_line_number[words[i][f
'{key}']] = set(line_numbers)
144 return found_words_with_line_number
148 Format the files according to unified diff format
150 :param root_dir: Path of the temp dir root to format the files
156 for root, dirs, files
in os.walk(root_dir):
157 for file_name
in files:
158 file_path = os.path.join(root, file_name)
159 with open(file_path,
'r', encoding=
'UTF-8')
as file:
160 file_contents = file.read()
162 normal_string = file_contents.encode().decode(
'unicode_escape')
163 except UnicodeDecodeError:
164 normal_string = file_contents
165 formatted_diff = self.
format_diffformat_diff(normal_string)
166 with open(file_path,
'w', encoding=
'utf-8')
as file:
167 file.write(formatted_diff)
Store the results of a regex match.