# Source code for atarashi.libs.commentPreprocessor
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Copyright 2018 Aman Jain (amanjain5221@gmail.com)
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
version 2 as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
import argparse
import code_comment # https://github.com/amanjain97/code_comment/
import os
import re
import string
import tempfile
# Module metadata: author name and contact address.
__author__ = "Aman Jain"
__email__ = "amanjain5221@gmail.com"
# Parsed command-line arguments; stays None unless the module is run as a
# script, in which case the __main__ section assigns the argparse result.
args = None
class CommentPreprocessor(object):
    """
    Text helpers used before a file is analysed.

    ``extract`` copies the comments of a file into a temporary file and
    ``preprocess`` normalises a string (case, copyright markers,
    punctuation and whitespace).
    """

    @staticmethod
    def preprocess(data):
        """
        Normalise ``data`` according to the following rules:

        - All whitespace should be treated as a single blank space
        - All upper case and lower case letters should be treated as lower
          case letters
        - "(c)", the copyright sign and "Copyright" should be considered
          equivalent and interchangeable
        - Any hyphen, dash, en dash, em dash, or other variation should be
          considered equivalent
        - Remove the exceptional characters (punctuation, typographic
          quotes, ellipsis, primes)

        :param data: Input file in string format
        :return: The pre-processed data, without leading/trailing blanks
        """
        data = data.lower()
        # Must run before punctuation is stripped, otherwise "(c)" would
        # already have been reduced to a bare "c".
        data = re.sub(r'copyright|\(c\)|\u00a9', 'copyright', data)
        # re.escape keeps the character class valid whatever characters
        # string.punctuation contains (']', '\\', '^', '-').
        data = re.sub(r'[{}]'.format(re.escape(string.punctuation)), ' ', data)
        # Typographic dashes/quotes get the same treatment as their ASCII
        # counterparts (a blank) so that e.g. "a-b" and "a\u2013b" normalise
        # to the same text instead of "a b" versus "ab".
        data = re.sub(
            r'[\u2013\u2014\u2015\u2018\u2019\u201a\u201b\u201c\u201d\u201e\u2026\u2032\u2033]',
            ' ', data)
        # Collapse every whitespace run, including a single tab or newline.
        data = re.sub(r'\s+', ' ', data)
        return data.strip()

    @staticmethod
    def extract(inputFile):
        """
        Extract comments from given input file and return a temp file stored
        in OS. For supported file extensions only the comments found by
        ``code_comment`` are written; for any other extension the whole file
        content is copied.

        Side effect: ``inputFile`` is rewritten in place as UTF-8 without a
        BOM and without undecodable bytes, so that it can be parsed.

        :param inputFile: Location of Input file from which comments needs
                          to be extracted
        :return: Temp file path from the OS; the caller owns the file and is
                 responsible for deleting it
        """
        fileType = inputFile.split('.')[-1]
        supportedFileExtensions = ('c', 'cpp', 'py', 'go', 'php', 'js', 'java',
                                   'h', 'hpp', 'cc', 'css', 'html')

        # Remove BOM UTF-8 at the beginning of file and ignore errors. Both
        # handles are closed explicitly so the rewritten file is fully
        # flushed before it is read again by path.
        with open(inputFile, mode='r', encoding='utf-8-sig',
                  errors='ignore') as inFile:
            content = inFile.read()
        with open(inputFile, mode='w', encoding='utf-8') as inFile:
            inFile.write(content)

        # Created only after the input was read successfully, so an
        # unreadable input does not leave a stray temporary file behind.
        fd, outputFile = tempfile.mkstemp()
        try:
            # Wrapping the descriptor (instead of reopening by path) ensures
            # it is closed exactly once, even when extraction fails.
            with os.fdopen(fd, 'w') as outFile:
                if fileType in supportedFileExtensions:
                    for comment in code_comment.extract(inputFile):
                        # Multi-line comment bodies are joined line by line,
                        # single-line bodies are concatenated as they are.
                        separator = '\n' if comment.is_multiline else ''
                        outFile.write(separator.join(comment._body))
                        outFile.write('\n')
                else:
                    # Unsupported extension: copy the text already in memory
                    # rather than re-reading the file with the locale's
                    # default encoding.
                    outFile.write(content + '\n')
        except Exception:
            # Do not leak a half-written temporary file on failure.
            os.remove(outputFile)
            raise
        return outputFile
# Command-line entry point: run either the comment extraction or the text
# pre-processing on a single input file and print the result.
if __name__ == "__main__":
    print("The file has been run directly")
    parser = argparse.ArgumentParser()
    parser.add_argument("-p", "--process", required=True,
                        choices=['preprocess', 'extract'],
                        help="Which process you want to run")
    parser.add_argument("inputFile",
                        help="Specify the input file which needs to be processed")
    parser.add_argument("-v", "--verbose", help="increase output verbosity",
                        action="count", default=0)
    args = parser.parse_args()
    process = args.process
    inputFile = args.inputFile
    verbose = args.verbose

    if process == "extract":
        tempLoc = str(CommentPreprocessor.extract(inputFile))
        print("Temporary output file path: ", tempLoc)
        if verbose > 0:
            # Dump the extracted text; the handle is closed right after use.
            with open(tempLoc, 'r') as extracted:
                print(extracted.read())
    else:
        # argparse restricts the choices, so this branch is "preprocess".
        with open(inputFile) as inFile:
            # Newlines are flattened to blanks before normalisation.
            data = inFile.read().replace('\n', ' ')
        print("Preprocessed data is: ", str(CommentPreprocessor.preprocess(data)))