This repository has been archived on 2023-06-18. You can view files and clone it, but cannot push or open issues or pull requests.
va-project/indexer/indexer.py

161 lines
5.4 KiB
Python
Raw Normal View History

2023-05-15 11:37:16 +00:00
import sys
sys.path.append('../group-1')
2023-05-15 17:16:50 +00:00
import math
import pandas as pd
2023-05-15 17:16:50 +00:00
import os
2023-05-15 11:37:16 +00:00
from scraper.top100_extractor import programming_crime_list
2023-05-15 17:16:50 +00:00
import numpy as np
2023-05-15 11:37:16 +00:00
from sklearn import preprocessing
2023-05-19 21:17:19 +00:00
import random
2023-05-15 17:16:50 +00:00
2023-05-15 11:37:16 +00:00
# Let pandas print up to 500 rows of a DataFrame before truncating the output.
pd.set_option('display.max_rows', 500)
def get_peg(ticker: str):
    """Return the most recent PEG ratio recorded for *ticker*.

    Reads ``Companies_Data/{ticker}_Data/{ticker}_current_ratios.csv`` (first
    column used as the index) and picks the ``PegRatio`` of the row with the
    latest ``asOfDate``.

    Args:
        ticker: Company symbol used to build the CSV path.

    Returns:
        The latest non-missing PEG ratio as a float, or ``0.0`` when the file
        has no ``PegRatio`` column or no dated row carries a PEG value.

    Raises:
        FileNotFoundError: If the ratios CSV does not exist.
        KeyError: If the CSV has no ``asOfDate`` column.
    """
    current_ratios = pd.read_csv(
        f'Companies_Data/{ticker}_Data/{ticker}_current_ratios.csv',
        index_col=[0],
    )
    # Dates are stored as text; convert them so sorting is chronological.
    current_ratios['asOfDate'] = pd.to_datetime(current_ratios['asOfDate'])

    if 'PegRatio' not in current_ratios.columns:
        return 0.0

    # Drop only rows that lack a date or a PEG value. A blanket dropna() would
    # also discard the newest row whenever some unrelated ratio is missing,
    # silently returning a stale PEG (or the 0.0 fallback) instead.
    dated_pegs = current_ratios.dropna(subset=['asOfDate', 'PegRatio'])
    if dated_pegs.empty:
        return 0.0

    newest_first = dated_pegs.sort_values('asOfDate', ascending=False)
    return float(newest_first['PegRatio'].iloc[0])
2023-05-19 21:17:19 +00:00
def get_financial_health(ticker: str):
    """Return the latest debt-to-assets ratio for *ticker*.

    Reads ``Companies_Data/{ticker}_Data/{ticker}_balance_sheet_4Y+4Q.csv``
    (first column used as the index) and computes ``TotalDebt / TotalAssets``
    for the row with the latest ``asOfDate``.

    Args:
        ticker: Company symbol used to build the CSV path.

    Returns:
        The ratio as a float, or the fallback ``2.0`` when either column is
        missing or no dated row has both values.

    Raises:
        FileNotFoundError: If the balance-sheet CSV does not exist.
        KeyError: If the CSV has no ``asOfDate`` column.
    """
    balance_sheet = pd.read_csv(
        f'Companies_Data/{ticker}_Data/{ticker}_balance_sheet_4Y+4Q.csv',
        index_col=[0],
    )
    # Dates are stored as text; convert them so sorting is chronological.
    balance_sheet['asOfDate'] = pd.to_datetime(balance_sheet['asOfDate'])

    needed = ['TotalDebt', 'TotalAssets']
    if any(column not in balance_sheet.columns for column in needed):
        return 2.0

    # Filter on the columns actually used: a blanket dropna() throws away any
    # statement with a gap in an unrelated line item, and when that removed
    # every row the subsequent lookup crashed with IndexError.
    usable = balance_sheet.dropna(subset=['asOfDate', *needed])
    if usable.empty:
        return 2.0

    newest_first = usable.sort_values('asOfDate', ascending=False)
    debt_to_assets = newest_first['TotalDebt'] / newest_first['TotalAssets']
    return float(debt_to_assets.iloc[0])
2023-05-19 21:17:19 +00:00
def estimated_growth(ticker: str):
    """Look up the stored 5-year growth estimate for *ticker*.

    Reads ``Companies_Data/{ticker}_Data/{ticker}5YGrowthEstimates.csv`` (first
    column used as the index) and returns the first entry of its
    ``5Y Growth estimate`` column, exactly as pandas parsed it.

    Args:
        ticker: Company symbol used to build the CSV path.

    Returns:
        The first value of the ``5Y Growth estimate`` column.
    """
    csv_path = f'Companies_Data/{ticker}_Data/{ticker}5YGrowthEstimates.csv'
    estimates = pd.read_csv(csv_path, index_col=[0])
    growth_column = estimates['5Y Growth estimate']
    return growth_column.values[0]
2023-05-19 21:17:19 +00:00
def employees_over_time(ticker: str):
    """Build a synthetic 12-point employee-count history for *ticker*.

    Only the current head count is real: it is read from row 0 of the
    ``number_employees`` column in
    ``Companies_Data/{ticker}_Data/{ticker}meta_data.csv``. The 11 earlier
    points are invented by walking backwards from it, each step adding a
    random offset drawn uniformly from -20% to +10% of the previous point.
    Because the offsets lean negative going backwards, the series tends to
    grow (with some noise) once it is put in chronological order.

    Args:
        ticker: Company symbol used to build the CSV path.

    Returns:
        A list of 12 values, oldest first, whose last element is the real
        employee count.
    """
    meta = pd.read_csv(f'Companies_Data/{ticker}_Data/{ticker}meta_data.csv')
    current_headcount = meta.at[0, 'number_employees']

    # Newest first while generating; each new point derives from the last one.
    newest_first = [current_headcount]
    for _ in range(11):
        latest = newest_first[-1]
        newest_first.append(latest + random.uniform(-0.2 * latest, 0.1 * latest))

    # Flip so the real, current value ends up at the end of the timeline.
    return newest_first[::-1]
def past_performance_earnings(ticker: str):
    """Return how far reported EPS beat or missed estimates, in percent.

    Reads ``Companies_Data/{ticker}_Data/{ticker}earnings.csv`` (first column
    used as the index) and compares the summed ``epsActual`` with the summed
    ``epsEstimate``.

    Args:
        ticker: Company symbol used to build the CSV path.

    Returns:
        ``(sum(actual) - sum(estimate)) / |sum(estimate)| * 100`` rounded to
        two decimals, as a float. Positive means earnings beat the estimates.
        ``0.0`` is returned when the estimates sum to zero (including when no
        row has both figures), because the percentage is undefined there.

    Raises:
        FileNotFoundError: If the earnings CSV does not exist.
        KeyError: If ``epsActual`` or ``epsEstimate`` is missing from the CSV.
    """
    earnings = pd.read_csv(
        f'Companies_Data/{ticker}_Data/{ticker}earnings.csv',
        index_col=[0],
    )

    # Compare like with like: a period that has only one of the two figures
    # would otherwise count towards one sum but not the other.
    paired = earnings.dropna(subset=['epsActual', 'epsEstimate'])
    estimate_total = paired['epsEstimate'].sum()

    # A zero denominator would yield inf/NaN, which then contaminates every
    # mean-based score computed from this column downstream.
    if estimate_total == 0:
        return 0.0

    surprise = paired['epsActual'].sum() - estimate_total
    # Divide by the magnitude so that beating a negative estimate (a smaller
    # loss than expected) is still reported as a positive percentage.
    return float(round(surprise / abs(estimate_total) * 100, 2))
2023-05-19 21:17:19 +00:00
2023-05-15 11:37:16 +00:00
def _logistic_score(values, steepness):
    """Map *values* onto the open interval (0, 200) with a logistic curve.

    The curve is centred on the mean of *values*, which scores exactly 100.
    A positive *steepness* rewards above-average values; a negative one
    rewards below-average values.
    """
    return 200 / (1 + math.e ** (steepness * (values.mean() - values)))


def normalizer():
    """Turn the raw metrics in ``Not_Normalized.csv`` into display scores.

    Reads ``Elaborated_Data/Not_Normalized.csv``, rescales four columns and
    writes the result to ``Elaborated_Data/normalized_data.csv``:

    * ``Valuation``, ``Estimated Growth`` and ``Past Performance`` go through
      a logistic curve centred on the column mean, so they land between 0 and
      200 with the average company at 100. ``Estimated Growth`` is
      additionally rounded to two decimals.
    * ``Financial Health`` is scaled linearly so that the column mean becomes
      80; it is therefore not capped at 200.

    ``Estimated Growth`` may be stored as text such as ``"12.5%"`` or as
    plain numbers; both are accepted.

    Returns:
        None. The result is written to disk.
    """
    # NOTE: no index_col is passed on purpose. A leading unnamed index column
    # in the input is carried into the output as-is, and the output layout is
    # left exactly as before so existing readers keep working.
    not_normalized = pd.read_csv('Elaborated_Data/Not_Normalized.csv')

    # Negative steepness: a valuation below the average earns a higher score.
    not_normalized['Valuation'] = _logistic_score(not_normalized['Valuation'], -0.2)

    financial_health = not_normalized['Financial Health']
    not_normalized['Financial Health'] = (80 / financial_health.mean()) * financial_health

    growth = not_normalized['Estimated Growth']
    # pandas already yields floats when no cell contains a '%' sign; the
    # .str accessor would raise AttributeError on such a numeric column.
    if not pd.api.types.is_numeric_dtype(growth):
        growth = growth.str.strip("%").astype("float")
    not_normalized['Estimated Growth'] = _logistic_score(growth, 0.08).round(2)

    not_normalized['Past Performance'] = _logistic_score(
        not_normalized['Past Performance'], 0.08
    )

    # Normalized dataframe consumed by the main page.
    not_normalized.to_csv(r'Elaborated_Data/normalized_data.csv')
2023-05-15 11:37:16 +00:00
def create_df(companies_list):
    """Collect the raw metrics for every company and save them as CSV files.

    For each ticker in *companies_list* the per-company helpers are called in
    a fixed order (PEG ratio, financial health, estimated growth, past
    performance, employee history). Two files are then written:

    * ``Elaborated_Data/Not_Normalized.csv`` with one row of metrics per
      ticker.
    * ``Elaborated_Data/employees_over_time.csv`` with one employee-history
      list per ticker.

    Nothing is written if any helper raises, because both files are produced
    only after the whole list has been processed.

    Args:
        companies_list: Iterable of ticker symbols.
    """
    metric_columns = [
        'Ticker',
        'Valuation',
        'Financial Health',
        'Estimated Growth',
        'Past Performance',
    ]
    headcount_columns = ['Ticker', 'Employees_over_time']

    metric_rows = []
    headcount_rows = []
    for symbol in companies_list:
        # Tuple items are evaluated left to right, matching the column order.
        metric_rows.append((
            symbol,
            get_peg(symbol),
            get_financial_health(symbol),
            estimated_growth(symbol),
            past_performance_earnings(symbol),
        ))
        headcount_rows.append((symbol, employees_over_time(symbol)))

    metrics = pd.DataFrame(metric_rows, columns=metric_columns)
    metrics.to_csv("Elaborated_Data/Not_Normalized.csv")

    headcounts = pd.DataFrame(headcount_rows, columns=headcount_columns)
    headcounts.to_csv(r"Elaborated_Data/employees_over_time.csv")
2023-05-15 11:37:16 +00:00
def main():
    """Build the raw metrics table and then the normalized one.

    Makes sure the ``Elaborated_Data`` output directory exists, writes the raw
    per-company metrics for ``programming_crime_list`` via ``create_df`` and
    finally derives the normalized scores with ``normalizer``.
    """
    # exist_ok=True replaces the separate exists()/mkdir() pair, so there is
    # no window in which another process can create the directory in between.
    os.makedirs(r"Elaborated_Data", exist_ok=True)
    create_df(programming_crime_list)
    normalizer()


if __name__ == '__main__':
    main()
2023-05-19 21:17:19 +00:00