This repository has been archived on 2023-06-18. You can view files and clone it, but cannot push or open issues or pull requests.
va-project/scraper/retriever.py

172 lines
4.8 KiB
Python
Raw Normal View History

2023-05-08 09:26:25 +00:00
#!/usr/bin/env python3
2023-05-19 21:17:19 +00:00
2023-05-08 09:26:25 +00:00
import pandas as pd
import os
from datetime import datetime, timedelta
from yahooquery import Ticker
from top100_extractor import programming_crime_list
import requests
from bs4 import BeautifulSoup
from top100_extractor import programming_crime_list
from PIL import Image
import yfinance as yf
from yahoo_fin import stock_info as si
2023-05-19 21:17:19 +00:00
def get_employee_number(ticker1):
    """Return the ``fullTimeEmployees`` entry of the asset profile for *ticker1*.

    The profile is looked up under the ticker exactly as passed in. If the
    profile has no ``fullTimeEmployees`` key the result is ``None``.

    Any exception raised while fetching or reading the profile is printed
    and ``0`` is returned instead, so callers never see the failure.
    """
    try:
        profile = Ticker(ticker1).asset_profile[ticker1]
        return profile.get('fullTimeEmployees')
    except Exception as e:
        # Best effort: report the problem and fall through to the default.
        print(f"Error retrieving employee number: {e}")
    return 0
2023-05-10 09:35:58 +00:00
def get_market_cap(ticker1):
    """Return the ``marketCap`` value from the summary detail of *ticker1*.

    The summary detail is indexed with the upper-cased ticker. No errors are
    handled here: a missing symbol or a missing ``marketCap`` entry
    propagates as whatever exception the lookup raises.
    """
    details = Ticker(ticker1).summary_detail
    return details[ticker1.upper()]['marketCap']
2023-05-13 14:36:24 +00:00
def get_earnings(ticker):
    """Return the ``earning_history`` attribute of the ``Ticker`` for *ticker*.

    The value is passed through untouched; nothing is validated or converted.
    """
    return Ticker(ticker).earning_history
2023-05-10 09:35:58 +00:00
def get_analyst_estimates(ticker):
    """Return one growth estimate for *ticker* from the analysts' info tables.

    The fifth row (position 4) of the "Growth Estimates" table is taken,
    empty cells are dropped, and the second remaining cell is returned.

    NOTE(review): presumably row 4 is the "next 5 years" estimate and cell 1
    is the company's own column - confirm against the table layout returned
    by ``si.get_analysts_info``.

    Raises:
        KeyError: if the "Growth Estimates" table is missing.
        IndexError: if the table has fewer than five rows or the row has
            fewer than two non-empty cells.
    """
    estimates = si.get_analysts_info(ticker)
    growth_row = estimates["Growth Estimates"].iloc[4].dropna()
    # The row is indexed by column labels, so an integer key only worked via
    # the deprecated positional fallback of Series.__getitem__. Use explicit
    # positional access so this keeps working once the fallback is removed.
    return growth_row.iloc[1]
2023-05-08 09:26:25 +00:00
def get_company_data(ticker):
    """Look up static company details for *ticker* in ``companies.csv``.

    The CSV file is read from the directory that contains this module.

    Returns:
        A 4-tuple ``(company name, industry, ceo, logo)`` taken from the
        first row whose ``ticker`` column equals *ticker*, or four
        ``"not available"`` strings when no row matches.
    """
    csv_path = os.path.join(os.path.dirname(__file__), r'companies.csv')
    companies = pd.read_csv(csv_path)

    matches = companies[companies['ticker'] == ticker]
    if matches.empty:
        return ("not available",) * 4

    # Only the first matching row is used, even if the ticker is listed twice.
    first = matches.iloc[0]
    return first['company name'], first['industry'], first['ceo'], first['logo']
def _copy_available_columns(source, wanted):
    """Return a new DataFrame with the columns of *source* listed in *wanted*.

    Names that are not in ``source.columns`` are skipped; the order of
    *wanted* is kept. If none are present the result is an empty DataFrame.
    """
    selected = pd.DataFrame({})
    for column in wanted:
        if column in source.columns:
            selected[column] = source[column]
    return selected


def get_stock_data(ticker):
    """Download the data for *ticker* and store it as CSV files.

    Files are written to ``Companies_Data/<ticker>_Data`` relative to the
    current working directory; missing directories are created. The
    following files are produced, each name prefixed with the ticker:

    * ``_price_history.csv``     - closing price and daily change in percent
    * ``_current_ratios.csv``    - selected valuation measures
    * ``_balance_sheet_4Y+4Q.csv`` - selected annual + quarterly balance
      sheet columns
    * ``meta_data.csv``          - company name, sector, CEO, logo, employees
    * ``5YGrowthEstimates.csv``  - analyst growth estimate
    * ``MarketCap.csv``          - market capitalization
    * ``earnings.csv``           - earnings history

    If the annual balance sheet comes back as a string instead of a table,
    the ticker is skipped: nothing is written and no directory is created.

    Side effect: sets the process-wide ``pd.options.display.float_format``.

    Returns:
        None.
    """
    stock = Ticker(ticker)
    stock_data = stock.history(period='max', interval='1d')
    ratios = stock.valuation_measures
    earnings_annual = stock.balance_sheet(frequency="a")
    earnings_last_4q = stock.balance_sheet(frequency='q')
    stock_data['price change (%)'] = stock_data['close'].pct_change() * 100

    pd.options.display.float_format = '{:,.2f}'.format

    # NOTE(review): a string here presumably is an error message from the
    # data provider - confirm. Only the annual result is checked.
    if isinstance(earnings_annual, str):
        return
    balance_sheet = pd.concat(
        [earnings_annual, earnings_last_4q], ignore_index=True)

    # makedirs creates both levels and tolerates existing directories,
    # which avoids the check-then-create race of exists() + mkdir().
    folder_name = os.path.join("Companies_Data", ticker + "_Data")
    os.makedirs(folder_name, exist_ok=True)

    price_data = pd.DataFrame({
        'Closing Price': stock_data['close'],
        'Price change (%)': stock_data['price change (%)'],
    })
    earnings_data = pd.DataFrame(get_earnings(ticker))

    company_name, sector, ceo, logo = get_company_data(ticker)
    employee_number = get_employee_number(ticker)
    company_data = pd.DataFrame({
        'company_name': [company_name],
        'sector': [sector],
        'ceo': [ceo],
        'logo': [logo],
        'number_employees': [employee_number]
    })

    estimated_growth = get_analyst_estimates(ticker)
    earnings_next5Y = pd.DataFrame({
        'company_name': [ticker],
        '5Y Growth estimate': [estimated_growth]
    })

    market_cap = get_market_cap(ticker)
    market_cap_file = pd.DataFrame({
        'company_name': [ticker],
        'Market capitalization': [market_cap]
    })

    ratios_components = ['asOfDate', 'PeRatio',
                         'PegRatio', 'PsRatio', 'PbRatio']
    balance_sheet_components = ['asOfDate', 'CurrentAssets',
                                'CurrentDebt', 'CashAndCashEquivalents',
                                'TotalAssets', 'TotalDebt']
    balance_sheet_data = _copy_available_columns(
        balance_sheet, balance_sheet_components)
    ratios_data = _copy_available_columns(ratios, ratios_components)

    # All downloads succeeded; only now are files written. The file names
    # are kept exactly as before (including the ones without an underscore).
    price_data.to_csv(os.path.join(folder_name, ticker+"_price_history.csv"))
    ratios_data.to_csv(os.path.join(folder_name, ticker+"_current_ratios.csv"))
    balance_sheet_data.to_csv(os.path.join(folder_name, ticker + "_balance_sheet_4Y+4Q.csv"))
    company_data.to_csv(os.path.join(folder_name, ticker + "meta_data.csv"))
    earnings_next5Y.to_csv(os.path.join(folder_name, ticker + "5YGrowthEstimates.csv"))
    market_cap_file.to_csv(os.path.join(folder_name, ticker + "MarketCap.csv"))
    earnings_data.to_csv(os.path.join(folder_name, ticker + "earnings.csv"))
2023-05-08 09:26:25 +00:00
def download_all():
    """Run ``get_stock_data`` for every entry of ``programming_crime_list``.

    Entries are processed one after another in list order; an exception
    raised for one entry stops the whole run.
    """
    for symbol in programming_crime_list:
        get_stock_data(symbol)
# Start the full download only when this file is executed as a script.
if __name__ == "__main__":
    download_all()