2023-05-15 11:37:16 +00:00
import sys
2023-05-17 11:42:56 +00:00
sys . path . append ( ' ../group-1 ' )
2023-05-15 17:16:50 +00:00
import math
2023-05-10 11:55:07 +00:00
import pandas as pd
2023-05-15 17:16:50 +00:00
import os
2023-05-15 11:37:16 +00:00
from scraper . top100_extractor import programming_crime_list
2023-05-15 17:16:50 +00:00
import numpy as np
2023-05-19 21:17:19 +00:00
import random
2023-05-15 17:16:50 +00:00
2023-05-15 11:37:16 +00:00
# Raise pandas' display limit so frames of up to 500 rows are shown in full.
pd . set_option ( ' display.max_rows ' , 500 )
2023-05-10 11:55:07 +00:00
def get_peg(ticker: str):
    """Return the most recent PEG ratio stored for ``ticker``.

    The ticker's current-ratios CSV is read, ordered from newest to oldest
    ``asOfDate`` and stripped of every row that contains a missing value;
    the PEG ratio of the first remaining row is returned.

    Args:
        ticker: Company ticker used to build the CSV path.

    Returns:
        The latest PEG ratio, or ``0.0`` when the CSV file does not exist,
        the PEG column is absent, or no complete row is left.
    """
    path = f' Companies_Data/ { ticker } _Data/ { ticker } _current_ratios.csv '
    try:
        current_ratios = pd.read_csv(path, index_col=[0])
    except FileNotFoundError:
        # A ticker without a ratios file gets the same default as a ticker
        # without a PEG column instead of aborting the caller.
        return 0.0

    current_ratios[' asOfDate '] = pd.to_datetime(current_ratios[' asOfDate '])
    # Newest report first, then keep only fully populated rows.
    current_ratios = current_ratios.sort_values(' asOfDate ', ascending=False)
    current_ratios = current_ratios.dropna()

    try:
        peg_column = current_ratios[' PegRatio ']
    except KeyError:
        return 0.0

    if peg_column.empty:
        return 0.0
    return peg_column.iloc[0]
2023-05-19 21:17:19 +00:00
2023-05-10 11:55:07 +00:00
def get_financial_health(ticker: str):
    """Return the latest assets-to-debt ratio for ``ticker``.

    The ticker's balance-sheet CSV is read, ordered from newest to oldest
    ``asOfDate`` and stripped of every row that contains a missing value.

    Args:
        ticker: Company ticker used to build the CSV path.

    Returns:
        Total assets divided by total debt for the most recent complete
        row, or ``2.0`` when either column is missing or no complete row
        exists.
    """
    path = f' Companies_Data/ { ticker } _Data/ { ticker } _balance_sheet_4Y+4Q.csv '
    balance_sheet = pd.read_csv(path, index_col=[0])
    balance_sheet[' asOfDate '] = pd.to_datetime(balance_sheet[' asOfDate '])
    # Newest report first, then keep only fully populated rows.
    balance_sheet = balance_sheet.sort_values(' asOfDate ', ascending=False)
    balance_sheet = balance_sheet.dropna()

    try:
        ratio = balance_sheet[' TotalAssets '] / balance_sheet[' TotalDebt ']
    except KeyError:
        return 2.0

    if ratio.empty:
        # Every row had a gap: use the same fallback as for missing columns
        # rather than failing on an empty lookup.
        return 2.0
    return ratio.iloc[0]
2023-05-19 21:17:19 +00:00
2023-05-10 11:55:07 +00:00
def estimated_growth(ticker: str):
    """Return the first 5-year growth estimate stored for ``ticker``.

    Args:
        ticker: Company ticker used to build the CSV path.

    Returns:
        The first entry of the growth-estimate column, exactly as read
        from the CSV.
    """
    csv_path = f' Companies_Data/ { ticker } _Data/ { ticker } 5YGrowthEstimates.csv '
    estimates = pd.read_csv(csv_path, index_col=[0])
    growth_column = estimates[' 5Y Growth estimate ']
    return growth_column.values[0]
2023-05-19 21:17:19 +00:00
def employees_over_time(ticker: str):
    """Build a synthetic 12-point headcount history for ``ticker``.

    Only the final value is real: it is the employee count read from the
    ticker's metadata CSV. The eleven earlier points are invented by
    walking backwards from it, each step adding a uniform random offset
    between -20% and +10% of the previous point, so the reversed series
    tends to grow over time with some fluctuation.

    Args:
        ticker: Company ticker used to build the CSV path.

    Returns:
        A list of 12 values ordered oldest to newest, ending with the
        employee count from the CSV.
    """
    meta = pd.read_csv(f' Companies_Data/ { ticker } _Data/ { ticker } meta_data.csv ')
    history = [meta.at[0, ' number_employees ']]
    for _ in range(11):
        latest = history[-1]
        history.append(latest + random.uniform(-0.2 * latest, 0.1 * latest))
    # Generated newest-first; hand it back in chronological order.
    return history[::-1]
2023-05-13 14:52:31 +00:00
def past_performance_earnings(ticker: str):
    """Return the aggregate earnings surprise of ``ticker`` in percent.

    The surprise is the difference between the summed actual EPS and the
    summed estimated EPS, relative to the magnitude of the summed estimate.

    Args:
        ticker: Company ticker used to build the CSV path.

    Returns:
        The surprise percentage rounded to two decimals; positive when the
        actual total is above the estimated total. ``0.0`` is returned when
        the estimated total is zero, because the ratio is undefined there.
    """
    earnings = pd.read_csv(
        f' Companies_Data/ { ticker } _Data/ { ticker } earnings.csv ',
        index_col=[0],
    )
    actual_total = earnings[' epsActual '].sum()
    estimate_total = earnings[' epsEstimate '].sum()

    if estimate_total == 0:
        # Avoid an inf/NaN result that would contaminate any later averaging.
        return 0.0

    # Divide by the magnitude so beating a negative estimate stays positive.
    surprise = (actual_total - estimate_total) / abs(estimate_total) * 100
    return round(surprise, 2)
2023-05-19 21:17:19 +00:00
2023-05-15 11:37:16 +00:00
def normalizer():
    """Rescale the raw score columns into the 0-200 range and save them.

    Reads the not-normalized CSV, transforms its four score columns and
    writes the resulting table to the normalized-data CSV:

    * Valuation uses ``200 / (1 + x**2 / 9)``, which peaks at 200 for x = 0.
    * Financial Health, Estimated Growth and Past Performance use a
      logistic curve centred on the column mean and scaled to 200.
    * Estimated Growth is first parsed from text to float and its scores
      are rounded to two decimals.
    """
    scores = pd.read_csv(' Elaborated_Data/Not_Normalized.csv ')

    def logistic(column, steepness):
        # 200 / (1 + e^(-k * (x - mean))): the column mean maps to 100.
        centred = -scores[column].mean() + scores[column]
        return 200 / (1 + math.e ** (-steepness * centred))

    valuation = scores[' Valuation ']
    scores[' Valuation '] = 200 / (1 + (1 / 9 * valuation ** 2))

    scores[' Financial Health '] = logistic(' Financial Health ', 0.1)

    # Growth arrives as text; strip the listed characters, then convert.
    # NOTE(review): the strip set and dtype name are kept exactly as written
    # here, padding spaces included — confirm pandas accepts this dtype name.
    growth_text = scores[' Estimated Growth '].str.strip(" % ")
    scores[' Estimated Growth '] = growth_text.astype(" float ")
    scores[' Estimated Growth '] = logistic(' Estimated Growth ', 0.1).round(2)

    scores[' Past Performance '] = logistic(' Past Performance ', 0.05)

    # Normalized table consumed by the main page.
    scores.to_csv(r' Elaborated_Data/normalized_data.csv ')
2023-05-15 11:37:16 +00:00
def create_df(companies_list):
    """Collect the raw metrics of every company and write them to CSV.

    For each ticker the four metric functions are evaluated in a fixed
    order, followed by the employee history. Two files are written: the
    not-normalized metrics table and the employees-over-time table.

    Args:
        companies_list: Iterable of company tickers to process.
    """
    # Output column -> function producing that column's value for a ticker.
    metric_sources = {
        ' Valuation ': get_peg,
        ' Financial Health ': get_financial_health,
        ' Estimated Growth ': estimated_growth,
        ' Past Performance ': past_performance_earnings,
    }
    metrics = {' Ticker ': [], **{column: [] for column in metric_sources}}
    headcounts = {' Ticker ': [], ' Employees_over_time ': []}

    for ticker in companies_list:
        metrics[' Ticker '].append(ticker)
        for column, compute in metric_sources.items():
            metrics[column].append(compute(ticker))

        headcounts[' Ticker '].append(ticker)
        headcounts[' Employees_over_time '].append(employees_over_time(ticker))

    metrics_frame = pd.DataFrame(data=metrics)
    metrics_frame.to_csv(" Elaborated_Data/Not_Normalized.csv ")

    headcount_frame = pd.DataFrame(data=headcounts)
    headcount_frame.to_csv(r" Elaborated_Data/employees_over_time.csv ")
2023-05-15 11:37:16 +00:00
def main():
    """Run the pipeline: prepare the output folder, build the raw table, normalize it."""
    # exist_ok avoids the check-then-create race of exists() followed by mkdir().
    os.makedirs(r" Elaborated_Data ", exist_ok=True)
    create_df(programming_crime_list)
    normalizer()
# Script entry point: main() is called only when the comparison below holds.
# NOTE(review): the literal is padded with spaces as written here — confirm it
# matches the interpreter's '__main__' value, otherwise main() never runs.
if __name__ == ' __main__ ' :
    main ( )
2023-05-19 21:17:19 +00:00