| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238 |
- #!/usr/bin/python
- import os
- import pandas as pd
- from bs4 import BeautifulSoup
- import datetime
- import sys, getopt
- from dateutil.relativedelta import relativedelta
- from stat import S_IREAD
- ########################
- ## ARGUMENTS ##
- #if not len(sys.argv)==3:
- # print("Need Vacancy Seniority.pdf and Master Seniority.html files as inputs!")
- # sys.exit("Incorrent Arguments. " + sys.argv[0] + " aborted.")
- #else:
- # VacancyFile=sys.argv[1]
- # MasterSenFile=sys.argv[2]
- ########################
- ##Script will attempt to read two Seniority Lists and merge them for data analysis
- ##Script will capture errors and throw exceptions
- ##Script will save result as a csv for archiving
- ##Script will delete original lists upon successful creation of merged csv
- #######################
- ##LOGGING##
verbosity = 2  # 0-silent, 1-debug, 2-verbose


def log(text, level=2):
    """Print *text* when *level* does not exceed the module verbosity.

    Args:
        text: Message handed to ``print``.
        level: Level of this message; defaults to 2 (verbose). The
            message is emitted only while ``level <= verbosity``.
    """
    # Guard clause: anything above the configured verbosity is dropped.
    if level > verbosity:
        return
    print(text)
- ######################
- #######################
- #Read PDF seniority list (domicile Seniority from SWALIFE)
- #Formerly pdf.py
def _parse_vacsen_lines(lines):
    """Build pilot records from the text lines of a vacancy seniority list.

    Lines are consumed in order and three kinds matter:

    * ``VACANCY ...`` headings set the domicile, seat, ETOPS flag and the
      "combined list" flag applied to the pilots that follow.
    * ``Effective ...`` lines supply the report date.
    * Any other line that is not a known page/column header is pilot data:
      a multi-token row (DomSen, ID, Lname, Fname) followed by a
      single-token line whose value is stored as MstrSen.

    Args:
        lines: Iterable of text lines (an open text file works).

    Returns:
        A list with one dict per pilot, keyed by ID, Fname, Lname, Dom,
        Seat, Etops, DomSen, MstrSen, vacsendate and combined.
    """
    records = []
    # Header state carried forward from heading lines. '' matches the blank
    # default of a record, so data seen before its heading yields blank
    # fields instead of a NameError.
    domicile = seat = etops = combined = reportdate = ''
    pilot = None
    header_prefixes = ("Base", "BG", "Sen", "Page", "Effective", "VACANCY", "vacancy")

    for line in lines:
        if line.startswith("VACANCY"):
            heading = line.strip()
            # Flag entries from OAK CA / OAK CA E and OAK FO / OAK FO E
            # lists (duplicates).
            combined = "/" in line
            # Slice the stripped heading in both branches so the result
            # does not depend on how the line happens to be terminated.
            if heading.endswith("E"):
                # ETOPS heading ends in "<DOM> <SEAT> E".
                domicile = heading[-8:-5]
                seat = heading[-4:-1]
                etops = 1
            else:
                # Regular heading ends in "<DOM> <SEAT>".
                domicile = heading[-6:-3]
                seat = heading[-2:]
                etops = 0
            continue

        if line.startswith("Effective"):
            # Characters 16-22 of the stripped line hold the report date.
            reportdate = line.strip()[16:23].strip()
            continue

        if line.startswith(header_prefixes):
            continue

        tokens = line.split()
        if len(tokens) > 1:
            # Name row: keep the first four tokens, dropping odd initials
            # or **INTL designations.
            pilot = tokens[:4]
            if len(pilot) < 4:
                pilot.append("X")  # single-name pilots have no first name
        elif len(tokens) == 1 and tokens[0] != "E":
            if pilot is None:
                # A lone value before any name row has nothing to attach to.
                continue
            # The lone value plus the current heading state completes the
            # pilot. Indices 4-8 are always read from the first extension
            # of this name row.
            pilot.extend([tokens[0], domicile, seat, etops, combined])
            records.append({
                'ID': pilot[1],
                'Fname': pilot[3],
                'Lname': pilot[2],
                'Dom': pilot[5],
                'Seat': pilot[6].strip(),
                'Etops': pilot[7],
                'DomSen': pilot[0],
                'MstrSen': pilot[4],
                'vacsendate': reportdate,
                'combined': pilot[8],
            })
    return records


def parse_vacsen(senfile):
    """Parse the PDF vacancy (domicile) seniority list into a DataFrame.

    The PDF is converted with the external ``pdftotext -raw`` tool into a
    text file inside a private temporary directory, the text is parsed,
    and the temporary directory is removed again.

    Args:
        senfile: Path of the PDF seniority list.

    Returns:
        pandas.DataFrame with the columns ID, Fname, Lname, DomSen, Dom,
        Etops, Seat, MstrSen, vacsendate and combined, one row per pilot.

    Raises:
        FileNotFoundError: If *senfile* does not exist, or the
            ``pdftotext`` executable cannot be found.
        subprocess.CalledProcessError: If ``pdftotext`` exits with a
            non-zero status.
    """
    # Imported locally so this function brings its own dependencies.
    import subprocess
    import tempfile

    log("Starting parse_vacsen with file " + senfile, 1)
    if not os.path.exists(senfile):
        raise FileNotFoundError("File " + str(senfile) + " not found")

    with tempfile.TemporaryDirectory() as workdir:
        textfile = os.path.join(workdir, 'test.txt')
        log("Executing pdftotext on " + str(senfile), 2)
        # An argument list (no shell) keeps file names containing spaces or
        # shell metacharacters intact; check=True stops on a failed
        # conversion instead of parsing a missing or stale text file.
        subprocess.run(['pdftotext', '-raw', str(senfile), textfile], check=True)
        log("COMPLETE!", 2)

        log("Opening test.txt and initializing lists and DataFrames", 2)
        with open(textfile, 'r') as myfile:
            log("COMPLETE!", 2)
            listPilots = _parse_vacsen_lines(myfile)
        # Leaving the TemporaryDirectory block deletes the text file.
        log("Removing test.txt", 2)

    # Build the frame in one step; DataFrame.append no longer exists in
    # pandas 2.x.
    dfPilots = pd.DataFrame(
        listPilots,
        columns=['ID', 'Fname', 'Lname', 'DomSen', 'Dom', 'Etops', 'Seat',
                 'MstrSen', 'vacsendate', 'combined'],
    )
    log("parse_vacsen success", 1)
    return dfPilots
- ###################################
- ###################################
- #Open html Seniority list from CWA and parse
- #Formerly sen.py
def parse_mstrsen(mstrsenfile):
    """Parse the HTML master seniority list into a DataFrame.

    Every ``<table class="detailListTable">`` in the file is scanned. Each
    table row contributes one pilot, read by position from the row's
    ``centerAlign``/``leftAlign`` cells: 0 -> TotalSen, 1 -> Name,
    3 -> ID, 4 -> DOH, 5 -> CAdt (left blank when the cell is '-'),
    6 -> DOB. Rows that yield no ID are skipped.

    Args:
        mstrsenfile: Path of the HTML seniority list.

    Returns:
        pandas.DataFrame with the columns ID, Name, TotalSen, DOH, CAdt
        and DOB. Parsed dates are ``datetime`` values; fields that were
        not filled stay ''.

    Raises:
        OSError: If *mstrsenfile* cannot be opened.
        ValueError: If a date cell does not match the ``%d%b%y`` format.
    """
    log("Starting parse_mstrsen", 2)
    # Routed through log() (level 1) so the message honours the verbosity
    # setting like every other progress message in this module.
    log("Starting parse_mstrsen with file: " + mstrsenfile, 1)

    def my_date(strdate):
        """Parse a ``%d%b%y`` date, correcting a wrongly guessed century."""
        mydate = datetime.datetime.strptime(strdate, '%d%b%y')
        # Two-digit years can be resolved into the future; such a date is
        # moved back by 100 years.
        if mydate > datetime.datetime.now():
            mydate = mydate - relativedelta(years=100)
        return mydate

    # The context manager closes the file even if the parser raises.
    with open(mstrsenfile) as f:
        soup = BeautifulSoup(f, 'html.parser')

    listPilots = []
    for table in soup.find_all('table', attrs={'class': 'detailListTable'}):
        for row in table.find_all('tr'):
            # Pilot attributes for this row; unfilled fields stay ''.
            dictPilotdata = {"ID": '', "Name": '', "TotalSen": '',
                             "DOH": '', "CAdt": '', "DOB": ''}
            cells = row.find_all('td', class_=['centerAlign', 'leftAlign'])
            for position, cell in enumerate(cells):
                text = str(cell.text)
                if position == 0:
                    dictPilotdata["TotalSen"] = text
                elif position == 1:
                    dictPilotdata["Name"] = text.strip()
                elif position == 3:
                    dictPilotdata["ID"] = text
                elif position == 4:
                    dictPilotdata["DOH"] = my_date(text)
                elif position == 5 and text != '-':
                    # '-' marks a missing value in this column.
                    dictPilotdata["CAdt"] = my_date(text)
                elif position == 6:
                    dictPilotdata["DOB"] = my_date(text)
            if dictPilotdata['ID']:  # ignore blank rows
                listPilots.append(dictPilotdata)

    # Build the frame in one step; DataFrame.append no longer exists in
    # pandas 2.x.
    dfPilots = pd.DataFrame(
        listPilots,
        columns=['ID', 'Name', 'TotalSen', 'DOH', 'CAdt', 'DOB'],
    )
    log("parse_mstrsen SUCCESS", 1)
    return dfPilots
- ###########################################################
- ##########################################################
- ###Merge the two parsed seniority DataFrames and save the result as a new csv file (the pre-merged source files are not deleted here)
- #Formerly merge.py
def mergesen(df_domsen, df_vacsen, build_csv=True):
    """Inner-join two parsed seniority lists on ID and optionally archive them.

    The frames are merged on the ``ID`` column, duplicate IDs are reduced
    to their first occurrence, and, when *build_csv* is true, the result
    is written to ``<vacsendate>_merged_senlist.csv`` in the current
    directory and its mode is changed to ``0o550``.

    Args:
        df_domsen: First DataFrame; must contain an ``ID`` column.
        df_vacsen: Second DataFrame; must contain an ``ID`` column.
        build_csv: Write the merged result to a CSV file when true.

    Returns:
        pandas.DataFrame: the merged frame with one row per ID.

    Raises:
        IndexError: If the two frames have no ID in common, so there is no
            row to take the report date from.
    """
    log("Starting mergesen", 1)
    df_merged = pd.merge(df_domsen, df_vacsen, on='ID', how='inner')
    if df_merged.empty:
        # Same exception type the positional lookup below would raise, but
        # with a message that names the actual problem.
        raise IndexError(
            "mergesen: the two seniority lists share no ID, nothing to merge")

    # The report date of the first merged row names the output file.
    reportdate = df_merged.iloc[0]['vacsendate']
    filename = reportdate + '_merged_senlist.csv'

    # NOTE: rows flagged as 'combined' are not filtered out; repeated IDs
    # are resolved by keeping the first occurrence.
    df_merged = df_merged.drop_duplicates(subset='ID', keep='first')

    if build_csv:
        log("Building merged.csv", 2)
        df_merged.to_csv(filename)
        try:
            # Mode 0o550: read/execute for owner and group, none for others.
            os.chmod(filename, 0o550)
        except OSError:
            # Best effort: the CSV is already written, so only report it.
            log("Could not modify permissions to read only for file " + filename, 2)
        else:
            log("Successfully changed " + filename + " to read-only", 2)

    log("mergesen SUCCESS!", 1)
    return df_merged
|