Average holdings and positions from over 400 different hedge funds.
Report | Description | PDF | CSV |
---|---|---|---|
Top 10 | Average Hedge Fund Portfolio (Top 10 Holdings) | View PDF | View CSV |
Top 20 | Average Hedge Fund Portfolio (Top 20 Holdings) | View PDF | View CSV |
Top 50 | Average Hedge Fund Portfolio (Top 50 Holdings) | View PDF | View CSV |
Top 100 | Average Hedge Fund Portfolio (Top 100 Holdings) | View PDF | View CSV |
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from fpdf import FPDF
from datetime import date
import os
import time
from tqdm import tqdm
# -------------------------------
# Step 1. Scrape list of hedge funds
# -------------------------------
def scrape_hedge_fund_list(url, num_of_results=150, out_csv='list_of_hedge_funds.csv'):
    """Scrape the paginated hedge-fund table at ``url`` into a CSV.

    Parameters
    ----------
    url : str
        Site base URL; relative fund links found in the table are appended to it.
    num_of_results : int
        Approximate number of funds to collect; the site serves 50 rows per page.
    out_csv : str
        Path of the CSV written with columns ``['name', 'url']``.

    Returns
    -------
    str
        ``out_csv``, the path of the file that was written.
    """
    page_starts = list(range(0, num_of_results, 50))
    last_start = page_starts[-1] if page_starts else None
    with sync_playwright() as p:
        # headless=False so the scrape can be watched; flip to True for unattended runs.
        browser = p.chromium.launch(headless=False)
        page = browser.new_page()
        page.goto(url)
        time.sleep(2)
        data = []
        for start in tqdm(page_starts):
            time.sleep(2)  # let client-side rendering settle before reading rows
            page.wait_for_selector("table tbody tr")
            for row in page.query_selector_all("table tbody tr"):
                data.append([
                    # the second line of the cell text holds the fund name
                    row.text_content().replace(' ', '').split('\n')[1],
                    # hrefs are site-relative, so prefix the base url
                    url + BeautifulSoup(row.inner_html(), "html.parser").find("a")["href"]
                ])
            # Bug fix: the original clicked "Next page" after the final batch as
            # well, which can raise once pagination is exhausted.
            if start != last_start:
                page.get_by_label('Next page').click()
        browser.close()
    df = pd.DataFrame(data, columns=['name', 'url'])
    df.to_csv(out_csv, index=False)
    print(df.head(), '\n', df.shape)
    return out_csv
# -------------------------------
# Step 2. Scrape holdings for each fund
# -------------------------------
def scrape_holdings_for_funds(fund_list_csv, output_folder='hedge_fund_data/'):
    """Download each fund's holdings table and save it as ``<name>_data.csv``.

    Parameters
    ----------
    fund_list_csv : str
        CSV with columns ``['name', 'url']`` (as written by
        ``scrape_hedge_fund_list``).
    output_folder : str
        Directory the per-fund CSVs are written into; created if missing.

    Errors for individual funds are printed and skipped so one bad page
    does not abort the whole run.
    """
    os.makedirs(output_folder, exist_ok=True)
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        context = browser.new_context()
        context.clear_cookies()

        def getHoldings(name, url):
            # One fresh page per fund; closed unconditionally at the end.
            page = context.new_page()
            page.goto(url)
            time.sleep(2)
            try:
                page.get_by_role("tab", name="Holdings", exact=True).click()
                time.sleep(2)  # wait for the holdings table to render
                # The last <tbody> on the page is the holdings table.
                soup = BeautifulSoup(page.query_selector_all("tbody")[-1].inner_html(), "html.parser")
                rows = []
                for tr in soup.find_all("tr"):
                    cells = [td.get_text(strip=True) for td in tr.find_all("td")]
                    rows.append(cells)
                # 'rm*' columns are layout filler in the site's table; dropped below.
                df = pd.DataFrame(rows, columns=[
                    'Stock', 'rm0', 'Sector', 'Shares Held', 'Market Value', '% of Portfolio', 'Previous % of Portfolio',
                    'Rank', 'Change in Shares', '% Change', '% Ownership', 'Qtr 1st Owned', 'Est. Avg Price',
                    'Qtr End Price', 'rm', 'rm2', 'rm3', 'rm4', 'Source Date', 'Date Reported'
                ])
                df = df.drop(columns=['rm0', 'rm', 'rm2', 'rm3', 'rm4'])
                # Bug fix: os.path.join instead of f"{output_folder}/{name}..."
                # avoids a double slash with the default 'hedge_fund_data/'.
                df.to_csv(os.path.join(output_folder, f"{name}_data.csv"), index=False)
            except Exception as e:
                # Best-effort: log and move on to the next fund.
                print("Error:", name, e)
            page.close()

        data = pd.read_csv(fund_list_csv)
        for _, fund in tqdm(data.iterrows(), total=len(data)):
            try:
                getHoldings(fund['name'], fund['url'])
            except Exception as e:
                print("Error:", fund['name'], e)
        browser.close()
# -------------------------------
# Step 3. Load and clean data
# -------------------------------
def load_and_clean_data(folder: str) -> pd.DataFrame:
    """Combine the per-fund holding CSVs in ``folder`` into one aggregate table.

    Parameters
    ----------
    folder : str
        Directory containing the ``*_data.csv`` files written by the scraper.

    Returns
    -------
    pd.DataFrame
        One row per (Stock, Sector) with summed 'Shares Held' and
        'Market Value', option positions (CALL/PUT) and zero-value rows
        removed, sorted by 'Market Value' descending.

    Raises
    ------
    ValueError
        If ``folder`` contains no CSV files.
    """
    dfs = []
    for file in os.listdir(folder):
        # Bug fix: only read CSV files — stray files (e.g. .DS_Store, notes)
        # previously crashed pd.read_csv or polluted the aggregation.
        if not file.endswith('.csv'):
            continue
        file_path = os.path.join(folder, file)
        file_df = pd.read_csv(file_path)[['Stock', 'Sector', 'Shares Held', 'Market Value']]
        for col in ['Shares Held', 'Market Value']:
            # Values arrive as strings with thousands separators, e.g. "1,234".
            file_df[col] = (
                file_df[col].astype(str)
                .str.replace(',', '', regex=False)
                .astype(float)
            )
        dfs.append(file_df)
    if not dfs:
        raise ValueError(f"No CSV files found in {folder!r}")
    combined_df = pd.concat(dfs, ignore_index=True)
    # Normalise tickers so the same stock aggregates across funds.
    combined_df['Stock'] = combined_df['Stock'].astype(str).str.strip().str.upper()
    result = (
        combined_df
        .groupby(['Stock', 'Sector'], as_index=False)[['Shares Held', 'Market Value']]
        .sum()
    )
    # Drop option positions and empty rows; only equity holdings are reported.
    result = result[~result["Stock"].str.contains("CALL|PUT", case=False, na=False)]
    result = result[result['Market Value'] != 0]
    return result.sort_values(by='Market Value', ascending=False, ignore_index=True)
# -------------------------------
# Step 4. Create a "Top N" slice
# -------------------------------
def prepare_top_n(df: pd.DataFrame, n: int) -> pd.DataFrame:
    """Take the first ``n`` rows (df is pre-sorted by value) and add weights.

    Parameters
    ----------
    df : pd.DataFrame
        Aggregated holdings, sorted by 'Market Value' descending.
    n : int
        Number of top holdings to keep.

    Returns
    -------
    pd.DataFrame
        Copy of the top-n slice with a leading '#' rank column and a
        'Market Value Percent' column that sums to 1.0 over the slice.
    """
    top_df = df.head(n).copy()
    top_df.insert(0, "#", np.arange(1, len(top_df) + 1))
    # Bug fix: the original computed round(total, -10), rounding the
    # denominator to the nearest $10bn — that zeroes it (inf/NaN weights)
    # for totals under $5bn and skews every weight otherwise. Use the
    # exact total so the weights are true portfolio fractions.
    total_value = top_df['Market Value'].sum()
    top_df['Market Value Percent'] = top_df['Market Value'] / total_value
    return top_df
# -------------------------------
# Step 5. Generate report (PDF + CSV)
# -------------------------------
def generate_report(input_df: pd.DataFrame, output_dir: str, label: str) -> str:
    """Write a Top-``label`` holdings report as a PDF and a CSV.

    Parameters
    ----------
    input_df : pd.DataFrame
        Output of ``prepare_top_n`` (must contain 'Market Value Percent').
    output_dir : str
        Destination directory; created if missing.
    label : str
        Report size label used in the title and file names (e.g. "10").

    Returns
    -------
    str
        Path of the PDF that was written.
    """
    report = input_df.copy()
    os.makedirs(output_dir, exist_ok=True)
    report['Round %'] = (report['Market Value Percent'] * 100).round(2)

    pdf = FPDF()
    pdf.add_page()

    # Title block: heading, run date, attribution.
    pdf.set_font("Arial", size=20)
    pdf.cell(200, 10, txt=f"Average Hedge Fund Portfolio (Top {label})", ln=True, align="C")
    pdf.set_font("Arial", size=12)
    pdf.cell(200, 5, txt=str(date.today()), ln=True, align="C")
    pdf.set_font("Arial", size=9)
    pdf.cell(200, 5, txt='© Max Lawton', ln=True, align="C")
    pdf.ln(6)

    columns = report.columns.tolist()
    # Narrow rank and final columns, wide data columns in between.
    widths = [5, 15] + [40] * (len(columns) - 3) + [15]

    # Header row in bold.
    pdf.set_font('Arial', 'B', 8)
    for width, title in zip(widths, columns):
        pdf.cell(width, 5, title, border=1, align="C")
    pdf.ln()

    # Data rows in a small font so wide tables fit the page.
    pdf.set_font("Arial", size=5)
    for record in report.values.tolist():
        for width, value in zip(widths, record):
            pdf.cell(width, 5, str(value), border=1, align="C")
        pdf.ln()

    pdf_filename = os.path.join(output_dir, f"AHP_top{label}.pdf")
    csv_filename = os.path.join(output_dir, f"AHP_top{label}.csv")
    pdf.output(pdf_filename)
    report.to_csv(csv_filename, index=False)
    print(f"Report generated: {pdf_filename}")
    return pdf_filename
# -------------------------------
# Step 6. Run full pipeline
# -------------------------------
if __name__ == "__main__":
    # Full pipeline: scrape fund directory -> scrape holdings -> aggregate
    # -> emit one PDF/CSV report per portfolio-size cutoff.
    fund_list_csv = scrape_hedge_fund_list("https://whalewisdom.com", num_of_results=150)
    scrape_holdings_for_funds(fund_list_csv, output_folder='hedge_fund_data')
    combined = load_and_clean_data('hedge_fund_data')
    for cutoff in (10, 20, 50, 100, 200):
        generate_report(prepare_top_n(combined, cutoff), "output", label=str(cutoff))