For my first foray into python, I wrote some parsing code that works as I intended.
I would like to share this code with some other external folk (because the functio it performs might be useful to them), and to be honest, I’m quite ashamed of my clunky and over the top coding… the actual work part of the code is very short, but I seem to have spent about 70% of the file declaring and redeclaring variables….
I’m pretty sure its not the right way of doing this, and I’m not asking for someone to line by line pick apart my coding, but some basic pointers around tidying up the mess would awesome.
And yes, todo is to comment the various section before I push it out….
import re, os
def setGlobals():
###################### match returns
global sigVersionMatch
global filepathMatch
global statusMatch
global puidMatch
global mimeMatch
global status2Match
global warningMatch
global filenameMatch
global fileExtensionMatch
###################### Flags and counters
global lineCounter
global headerFlag
global newLine
###################### header variables
global headerLineOne
global headerLineTwo
global sigVersion
###################### searches as a variable
global SearchForStatus
global SearchForFilename
global SearchForFilepath
global SearchForPuid
global SearchForMime
global SearchForStatus2
global SearchForWarning
global SearchForFileExtension
####################### searchstring variables
global filepath
global status
global puid
global mime
global status2
global warning
global filename
global sigVersion
global fileExtension
###################### file name lists
global newfile
####################### Flag and counters settings
headerFlag = 0
lineCounter = 0
newLine=""
###################### search strings headers
headerLineOne = re.compile(r'(DROID Version,)')
headerLineTwo = re.compile(r'(Status,File,Warning,)')
sigVersion = re.compile(r'(?<=SigFile Version,")[0-9]*')
###################### search strings body
filepath = re.compile(r'(?P<filepath>(F:\\ExLib.*\w))')
status = re.compile(r'(?P<status>(?<!,")(Positive|Not identified|Tentative))')
puid = re.compile(r'(?P<puid>(x-fmt/|fmt/)([0-9]{1,3}))')
mime = re.compile(r'(?P<mime>([a-zA-Z]*\/([a-zA-Z]|\-)*(?=",)))')
status2 = re.compile(r'(?P<status2>(Positive \(Specific Format\)|Positive \(Generic Format\)|(Tentative)(?=(",""))))')
warning = re.compile(r'(?P<warning>(Possible file extension mismatch))')
filename = re.compile(r'(?P<filenam>(V[0-9]\-.*\w))')
headerLineOne = re.compile(r'(DROID Version,)')
headerLineTwo = re.compile(r'(Status,File,Warning,)')
sigVersion = re.compile(r'(?<=SigFile Version,")[0-9]*')
fileExtension = re.compile(r'(?<=\.).*')
def doSearches(line):
global SearchForStatus
global SearchForFilename
global SearchForFilepath
global SearchForPuid
global SearchForMime
global SearchForStatus2
global SearchForWarning
global SearchForFileExtension
global status
global filename
global filepath
global puid
global mime
global status2
global warning
global fileExtension
global filepathMatch
global statusMatch
global puidMatch
global mimeMatch
global status2Match
global warningMatch
global filenameMatch
global fileExtensionMatch
####### do searches
SearchForStatus = status.search(line)
SearchForFilename = filename.search(line)
SearchForFilepath = filepath.search(line)
SearchForPuid = puid.search(line)
SearchForMime = mime.search(line)
SearchForStatus2 = status2.search(line)
SearchForWarning = warning.search(line)
###### convert search returns to strings
if SearchForStatus:
doInitialseVariables()
statusMatch = str(SearchForStatus.group())
if SearchForFilename:
filenameMatch = str(SearchForFilename.group())
SearchForFileExtension = fileExtension.search(filenameMatch)
if SearchForFileExtension:
fileExtensionMatch = str(SearchForFileExtension.group())
if SearchForFilepath:
filepathMatch = str(SearchForFilepath.group())
if SearchForPuid:
puidMatch = str(SearchForPuid.group())
if SearchForMime:
mimeMatch = str(SearchForMime.group())
if SearchForStatus2:
status2Match = str(SearchForStatus2.group())
if SearchForWarning:
warningMatch = str(SearchForWarning.group())
if statusMatch == str("Not identified"):
doBuildLine(line)
if SearchForPuid:
doBuildLine(line)
def doBuildLine(line):
global filepathMatch
global statusMatch
global puidMatch
global mimeMatch
global status2Match
global warningMatch
global filenameMatch
global fileExtensionMatch
global newLine
global lineCounter
global newfile
lineCounter = lineCounter + 1
print lineCounter
newLine = "3,"+ str(sigVersionMatch)+",slow," + str(lineCounter) + ",,," + str(filepathMatch) + "," + str(filenameMatch) + ",," + str(statusMatch) + ",,," + str(fileExtensionMatch) + ",," + str(warningMatch) + ",,," + str(puidMatch) + "," + str(mimeMatch) + ",,\n"
outfile = open(newfile,"a")
outfile.write(newLine)
outfile.close()
def doInitialseVariables():
global filepathMatch
global statusMatch
global puidMatch
global mimeMatch
global status2Match
global warningMatch
global filenamMatch
global lineOnceFlag
global fileExtensionMatch
global lineOneFlag
global lineTwoFlag
puidMatch = ""
mimeMatch= ""
status2Match = ""
warningMatch = ""
statusMatch = ""
filepathMatch = ""
filenameMatch = ""
fileExtensionMatch = ""
def doGetHeaderVariables(line):#matches header lines, strips sig version, saves as sigVersionMatch
global sigVersionMatch
M1 = headerLineOne.search(line)
M2 = headerLineTwo.search(line)
M3 = sigVersion.search(line)
if M3:
sigVersionMatch = str(M3.group())
def doStartProcessing(line):
global headerFlag
global lineCounter
if headerFlag == 0: #stops header match testing once successfully complete
doGetHeaderVariables(line)
headerFlag = 1
doSearches(line)
if __name__ == "__main__":
setGlobals()
global newfile
doInitialseVariables() #sets up the variable space to begin
directory = "C:\droid\logs\\"
extension = ".csv"
list_of_files = [file for file in os.listdir(directory) if file.lower().endswith(extension)]
for currentfile in list_of_files:
logpath = str(directory)+str(currentfile)
newfile = str(directory) + "cleaned\\" + str(currentfile)
for line in open(logpath,'r'):
doStartProcessing(line)
headerFlag = 0
lineCounter = 0
You only need to use the
globalstatement for names that you are going to reassign in the function. You can access any name you want without it, including accessing it to call a method on it that will modify the object.Your code would be vastly simplified if you used a class. The global variables would become attributes of the object, and your functions would become methods.
Don’t use tabs to indent, use spaces (the code would be readable above if you do this).
You don’t need to compile regexes before you use them. The
remodule compiles and caches for you automatically.