parse.py 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238
#!/usr/bin/python
import datetime
import getopt
import os
import subprocess
import sys
from stat import S_IREAD

import pandas as pd
from bs4 import BeautifulSoup
from dateutil.relativedelta import relativedelta
  9. ########################
  10. ## ARGUMENTS ##
  11. #if not len(sys.argv)==3:
  12. # print("Need Vacancy Seniority.pdf and Master Seniority.html files as inputs!")
# sys.exit("Incorrect Arguments. " + sys.argv[0] + " aborted.")
  14. #else:
  15. # VacancyFile=sys.argv[1]
  16. # MasterSenFile=sys.argv[2]
  17. ########################
  18. ##Script will attempt to read two Seniority Lists and merge them for data analysis
##Script will capture errors and throw exceptions
  20. ##Script will save result as a csv for archiving
  21. ##Script will delete original lists upon successful creation of merged csv
  22. #######################
  23. ##LOGGING##
  24. verbosity = 2 #0-silent, 1-debug, 2-verbose
  25. def log(text, level=2):
  26. #takes text as argument, assumes logging level is verbose unless set)
  27. if level <= verbosity:
  28. print(text)
  29. ######################
  30. #######################
  31. #Read PDF seniority list (domicile Seniority from SWALIFE)
  32. #Formerly pdf.py
  33. def parse_vacsen(senfile):
  34. log("Starting parse_vacsen with file " + senfile,1)
  35. if os.path.exists(senfile):
  36. log("Executing pdftotext on " + str(senfile), 2)
  37. os.system('pdftotext -raw ' + senfile + ' test.txt')
  38. log("COMPLETE!", 2)
  39. else:
  40. raise Exception("File " + str(senfile) + " not found")
  41. log("Opening test.txt and initializing lists and DataFrames", 2)
  42. myfile = open('test.txt', 'r')
  43. listPilots = [] #initiallize list to pass to DataFrame
  44. #initialize DataFrame
  45. dfPilots = pd.DataFrame(columns=('ID', 'Fname', 'Lname', 'DomSen', 'Dom', 'Etops', 'Seat', 'MstrSen', 'vacsendate', 'combined'))
  46. log("COMPLETE!", 2)
  47. for line in myfile:
  48. dictPilotdata = {'ID' : '', 'Fname' : '', 'Lname' : '', 'Dom' : '', 'Seat' : '', 'Etops' : '', 'DomSen' : '', 'MstrSen' : '', 'vacsendate' : '', 'combined':''}
  49. if line.startswith("VACANCY"):
  50. combined = "/" in line # Flag entries from OAK CA / OAK CA E and OAK FO / OAK FO E lists (duplicates)
  51. if not line.strip()[-1]=="E":
  52. domicile=line.strip()[-6:-3] # strip the Domicile
  53. seat=line.strip()[-2:]
  54. etops=0
  55. else:
  56. domicile=line[-9:-6] #if ETOPS, strip domicile this way
  57. seat=line[-5:-2]
  58. etops=1
  59. if line.startswith("Effective"):
  60. reportdate=line.strip()[16:23] #Strip the report generation date
  61. reportdate=reportdate.strip()
  62. if not line.startswith(("Base", "BG", "Sen", "Page", "Effective", "VACANCY", "vacancy")):
  63. holder=line.split() #derive data from text file to space delimited list
  64. if len(holder)>1: #For lists with name data...
  65. pilot=holder #populate list _pilot_
  66. pilot=pilot[:4] #strip odd initials or **INTL designationscat
  67. if len(pilot)<4:
  68. pilot.append("X") #Fix for single name names (no first name)
  69. elif len(holder) == 1 and holder[0] != "E": #For lists with one item (dom sen)...
  70. pilot.append(holder[0]) #Add to _pilot_
  71. pilot.append(domicile) #Add _domicile_ to _pilot_
  72. pilot.append(seat) #Add _seat_ to _pilot_
  73. pilot.append(etops)
  74. pilot.append(combined)
  75. dictPilotdata['ID'] = pilot[1]
  76. dictPilotdata['Fname'] = pilot[3]
  77. dictPilotdata['Lname'] = pilot[2]
  78. dictPilotdata['Dom'] = pilot[5]
  79. dictPilotdata['Seat'] = pilot[6].strip()
  80. dictPilotdata['Etops'] = pilot[7]
  81. dictPilotdata['DomSen'] = pilot[0]
  82. dictPilotdata['MstrSen'] = pilot[4]
  83. dictPilotdata['vacsendate'] = reportdate
  84. dictPilotdata['combined'] = pilot[8]
  85. listPilots.append(dictPilotdata)
  86. myfile.close()
  87. log("Removing text.txt", 2)
  88. os.remove("test.txt")
  89. dfPilots = dfPilots.append(listPilots)
  90. # dfPilots = dfPilots.drop_duplicates(['ID'], keep='first', inplace=True)
  91. # test=dfPilots.drop_duplicates()
  92. # try:
  93. # test.to_csv(r'pdf.csv')
  94. # except:
  95. # log("Could not save parse_vacsen DataFrame as pdf.csv", 0)
  96. return dfPilots
  97. log("parse_vacsen success", 1)
  98. ###################################
  99. ###################################
  100. #Open html Seniority list from CWA and parse
  101. #Formerly sen.py
  102. def parse_mstrsen(mstrsenfile):
  103. log("Starting parse_mstrsen", 2)
  104. print("Starting parse_mstrsen with file: " + mstrsenfile)
  105. def my_date(strdate):
  106. mydate = datetime.datetime.strptime(strdate,'%d%b%y')
  107. #Parsed dates from senlist have 2 digit years. Parsed date assume wrong century
  108. if mydate > datetime.datetime.now(): #If parsed date has wrong century (is later than today):
  109. mydate = mydate - relativedelta(years=100) # subtract 100 years from date
  110. return mydate
  111. f = open(mstrsenfile)
  112. soup = BeautifulSoup(f, 'html.parser')
  113. f.close()
  114. #Get all tables with class detailListTable
  115. table_list = soup.findAll('table', attrs={'class':'detailListTable'})
  116. count = 0
  117. #Initialize list which will pass data to DataFrame
  118. listPilots = []
  119. #Initialize DataDrame
  120. dfPilots = pd.DataFrame(columns=('ID', 'Name', 'TotalSen', 'DOH', 'CAdt', 'DOB'))
  121. #Iterate through each individual table
  122. for table in table_list:
  123. for row in table.findAll('tr'): #Iterate through Rows
  124. dictPilotdata = {"ID":'', "Name": '', "TotalSen":'', "DOH":'', "CAdt": '', "DOB":''} #Initialize pilot attributes dictionary
  125. count = 0 # Reset Counter
  126. for col in row.find_all('td', class_=['centerAlign', 'leftAlign']): #Iterate through relavent TDs
  127. #Place data in dictionary based on counter
  128. if count == 0:
  129. dictPilotdata["TotalSen"] = str(col.text)
  130. elif count == 1:
  131. dictPilotdata["Name"] = str(col.text).strip()
  132. elif count == 3:
  133. dictPilotdata["ID"] = str(col.text)
  134. elif count == 4:
  135. dictPilotdata["DOH"] = my_date(str(col.text))
  136. elif count == 5 and not str(col.text) == '-':
  137. dictPilotdata["CAdt"] = my_date(str(col.text))
  138. elif count == 6:
  139. dictPilotdata["DOB"] = my_date(str(col.text))
  140. count+=1 #increment counter for next pilot data row
  141. #Add new dictionary entry with Key EmpNum and attributes dictionary as value
  142. if dictPilotdata['ID']: #Ignore blank rows
  143. listPilots.append(dictPilotdata)
  144. dfPilots = dfPilots.append(listPilots)
  145. # dfPilots.to_csv(r'sen.csv')
  146. return dfPilots
  147. log("parse_mstrsen SUCCESS",1)
  148. ###########################################################
  149. ##########################################################
  150. ###Merge csv files into a new file, and delete pre-merged files
  151. #Formerly merge.py
  152. def mergesen(df_domsen, df_vacsen, build_csv=True):
  153. log("Starting mergesen", 1)
  154. df_merged = pd.merge(df_domsen, df_vacsen, on='ID', how='inner')
  155. reportdate = df_merged.iloc[0]['vacsendate']
  156. filename = reportdate + '_merged_senlist.csv'
  157. # df_merged = df_merged[df_merged.combined != True]
  158. df_merged = df_merged.drop_duplicates(subset='ID', keep='first', inplace=False)
  159. if build_csv:
  160. log("Building merged.csv", 2)
  161. df_merged.to_csv(filename)
  162. try:
  163. #change permissions on file
  164. os.chmod(filename, 0o550)
  165. log("Successfully changed " + filename + " to read-only", 2)
  166. except:
  167. log("Could not modify permissions to read only for file " + filename, 2)
  168. log("mergesen SUCCESS!", 1)
  169. return df_merged