parse.py 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238
#!/usr/bin/python
import datetime
import getopt
import os
import subprocess
import sys
from stat import S_IREAD

import pandas as pd
from bs4 import BeautifulSoup
from dateutil.relativedelta import relativedelta
  9. ########################
  10. ## ARGUMENTS ##
  11. #if not len(sys.argv)==3:
  12. # print("Need Vacancy Seniority.pdf and Master Seniority.html files as inputs!")
# sys.exit("Incorrect Arguments. " + sys.argv[0] + " aborted.")
  14. #else:
  15. # VacancyFile=sys.argv[1]
  16. # MasterSenFile=sys.argv[2]
  17. ########################
  18. ##Script will attempt to read two Seniority Lists and merge them for data analysis
##Script will capture errors and throw exceptions
  20. ##Script will save result as a csv for archiving
  21. ##Script will delete original lists upon successful creation of merged csv
  22. #######################
  23. ##LOGGING##
  24. verbosity = 2 #0-silent, 1-debug, 2-verbose
  25. def log(text, level=2):
  26. #takes text as argument, assumes logging level is verbose unless set)
  27. if level <= verbosity:
  28. print(text)
  29. ######################
  30. #######################
  31. #Read PDF seniority list (domicile Seniority from SWALIFE)
  32. #Formerly pdf.py
  33. def parse_vacsen(senfile):
  34. log("Starting parse_vacsen with file " + senfile,1)
  35. if os.path.exists(senfile):
  36. log("Executing pdftotext on " + str(senfile), 2)
  37. os.system('pdftotext -raw ' + senfile + ' test.txt')
  38. log("COMPLETE!", 2)
  39. else:
  40. raise Exception("File " + str(senfile) + " not found")
  41. log("Opening test.txt and initializing lists and DataFrames", 2)
  42. myfile = open('test.txt', 'r')
  43. listPilots = [] #initiallize list to pass to DataFrame
  44. #initialize DataFrame
  45. dfPilots = pd.DataFrame(columns=('ID', 'Fname', 'Lname', 'DomSen', 'Dom', 'Etops', 'Seat', 'MstrSen', 'vacsendate', 'combined'))
  46. log("COMPLETE!", 2)
  47. for line in myfile:
  48. dictPilotdata = {'ID' : '', 'Fname' : '', 'Lname' : '', 'Dom' : '', 'Seat' : '', 'Etops' : '', 'DomSen' : '', 'MstrSen' : '', 'vacsendate' : '', 'combined':''}
  49. if line.startswith("VACANCY"):
  50. combined = "/" in line # Flag entries from OAK CA / OAK CA E and OAK FO / OAK FO E lists (duplicates)
  51. if not line.strip()[-1]=="E":
  52. domicile=line.strip()[-6:-3] # strip the Domicile
  53. seat=line.strip()[-2:]
  54. etops=0
  55. else:
  56. domicile=line[-9:-6] #if ETOPS, strip domicile this way
  57. seat=line[-5:-2]
  58. etops=1
  59. if line.startswith("Effective"):
  60. reportdate=line.strip()[16:23] #Strip the report generation date
  61. reportdate=reportdate.strip()
  62. if not line.startswith(("Base", "BG", "Sen", "Page", "Effective", "VACANCY", "vacancy")):
  63. holder=line.split() #derive data from text file to space delimited list
  64. if len(holder)>1: #For lists with name data...
  65. pilot=holder #populate list _pilot_
  66. pilot=pilot[:4] #strip odd initials or **INTL designationscat
  67. if len(pilot)<4:
  68. pilot.append("X") #Fix for single name names (no first name)
  69. elif len(holder) == 1 and holder[0] != "E": #For lists with one item (dom sen)...
  70. pilot.append(holder[0]) #Add to _pilot_
  71. pilot.append(domicile) #Add _domicile_ to _pilot_
  72. pilot.append(seat) #Add _seat_ to _pilot_
  73. pilot.append(etops)
  74. pilot.append(combined)
  75. dictPilotdata['ID'] = pilot[1]
  76. dictPilotdata['Fname'] = pilot[3]
  77. dictPilotdata['Lname'] = pilot[2]
  78. dictPilotdata['Dom'] = pilot[5]
  79. dictPilotdata['Seat'] = pilot[6].strip()
  80. dictPilotdata['Etops'] = pilot[7]
  81. dictPilotdata['DomSen'] = pilot[0]
  82. dictPilotdata['MstrSen'] = pilot[4]
  83. dictPilotdata['vacsendate'] = reportdate
  84. dictPilotdata['combined'] = pilot[8]
  85. listPilots.append(dictPilotdata)
  86. myfile.close()
  87. log("Removing text.txt", 2)
  88. os.remove("test.txt")
  89. dfPilots = dfPilots.append(listPilots)
  90. # dfPilots = dfPilots.drop_duplicates(['ID'], keep='first', inplace=True)
  91. # test=dfPilots.drop_duplicates()
  92. # try:
  93. # test.to_csv(r'pdf.csv')
  94. # except:
  95. # log("Could not save parse_vacsen DataFrame as pdf.csv", 0)
  96. return dfPilots
  97. log("parse_vacsen success", 1)
  98. ###################################
  99. ###################################
  100. #Open html Seniority list from CWA and parse
  101. #Formerly sen.py
  102. def parse_mstrsen(mstrsenfile):
  103. log("Starting parse_mstrsen", 2)
  104. print("Starting parse_mstrsen with file: " + mstrsenfile)
  105. def my_date(strdate):
  106. mydate = datetime.datetime.strptime(strdate,'%d%b%y')
  107. #Parsed dates from senlist have 2 digit years. Parsed date assume wrong century
  108. if mydate > datetime.datetime.now(): #If parsed date has wrong century (is later than today):
  109. mydate = mydate - relativedelta(years=100) # subtract 100 years from date
  110. return mydate
  111. f = open(mstrsenfile)
  112. soup = BeautifulSoup(f, 'html.parser')
  113. f.close()
  114. #Get all tables with class detailListTable
  115. table_list = soup.findAll('table', attrs={'class':'detailListTable'})
  116. count = 0
  117. #Initialize list which will pass data to DataFrame
  118. listPilots = []
  119. #Initialize DataDrame
  120. dfPilots = pd.DataFrame(columns=('ID', 'Name', 'TotalSen', 'DOH', 'CAdt', 'DOB'))
  121. #Iterate through each individual table
  122. for table in table_list:
  123. for row in table.findAll('tr'): #Iterate through Rows
  124. dictPilotdata = {"ID":'', "Name": '', "TotalSen":'', "DOH":'', "CAdt": '', "DOB":''} #Initialize pilot attributes dictionary
  125. count = 0 # Reset Counter
  126. for col in row.find_all('td', class_=['centerAlign', 'leftAlign']): #Iterate through relavent TDs
  127. #Place data in dictionary based on counter
  128. if count == 0:
  129. dictPilotdata["TotalSen"] = str(col.text)
  130. elif count == 1:
  131. dictPilotdata["Name"] = str(col.text).strip()
  132. elif count == 3:
  133. dictPilotdata["ID"] = str(col.text)
  134. elif count == 4:
  135. dictPilotdata["DOH"] = my_date(str(col.text))
  136. elif count == 5 and not str(col.text) == '-':
  137. dictPilotdata["CAdt"] = my_date(str(col.text))
  138. elif count == 6:
  139. dictPilotdata["DOB"] = my_date(str(col.text))
  140. count+=1 #increment counter for next pilot data row
  141. #Add new dictionary entry with Key EmpNum and attributes dictionary as value
  142. if dictPilotdata['ID']: #Ignore blank rows
  143. listPilots.append(dictPilotdata)
  144. dfPilots = dfPilots.append(listPilots)
  145. # dfPilots.to_csv(r'sen.csv')
  146. return dfPilots
  147. log("parse_mstrsen SUCCESS",1)
  148. ###########################################################
  149. ##########################################################
  150. ###Merge csv files into a new file, and delete pre-merged files
  151. #Formerly merge.py
  152. def mergesen(df_domsen, df_vacsen, build_csv=True):
  153. log("Starting mergesen", 1)
  154. df_merged = pd.merge(df_domsen, df_vacsen, on='ID', how='inner')
  155. reportdate = df_merged.iloc[0]['vacsendate']
  156. filename = reportdate + '_merged_senlist.csv'
  157. # df_merged = df_merged[df_merged.combined != True]
  158. df_merged = df_merged.drop_duplicates(subset='ID', keep='first', inplace=False)
  159. if build_csv:
  160. log("Building merged.csv", 2)
  161. df_merged.to_csv(filename)
  162. try:
  163. #change permissions on file
  164. os.chmod(filename, 0o550)
  165. log("Successfully changed " + filename + " to read-only", 2)
  166. except:
  167. log("Could not modify permissions to read only for file " + filename, 2)
  168. log("mergesen SUCCESS!", 1)
  169. return df_merged