#!/usr/bin/env python3
"""
Download additional leagues to expand training data
"""

import os
import sys
import pandas as pd
import requests
import time
from datetime import datetime

# Resolve data paths relative to this script so it runs from any CWD
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, '..', 'data')

# Additional leagues not yet in our dataset
# Keys are football-data.co.uk division codes (used verbatim in download URLs)
EXTRA_LEAGUES = {
    'E3': {'name': 'League Two', 'country': 'England'},
    'D2': {'name': '2. Bundesliga', 'country': 'Germany'},
    'I2': {'name': 'Serie B', 'country': 'Italy'},
    'SP2': {'name': 'La Liga 2', 'country': 'Spain'},
    'F2': {'name': 'Ligue 2', 'country': 'France'},
    'T1': {'name': 'Super Lig', 'country': 'Turkey'},
    'G1': {'name': 'Super League', 'country': 'Greece'},
}

# Seasons encoded as YYyy per football-data.co.uk's URL scheme
# (e.g. '1516' means the 2015/2016 season)
SEASONS = ['1516', '1617', '1718', '1819', '1920', '2021', '2122', '2223', '2324', '2425']

def download_csv(url):
    """Download a CSV from *url* into a DataFrame, retrying on failure.

    Makes up to 3 attempts with a 1-second pause between them.  Within
    each attempt, falls back from UTF-8 to Latin-1 decoding, because
    football-data.co.uk files are not consistently UTF-8 encoded and a
    decode error should not consume a network retry.

    Returns the parsed DataFrame, or None if every attempt failed.
    """
    for attempt in range(3):
        for encoding in ('utf-8', 'latin-1'):
            try:
                return pd.read_csv(url, encoding=encoding, on_bad_lines='skip')
            except UnicodeDecodeError:
                # Wrong encoding guess: try the fallback immediately
                continue
            except Exception:
                # Network/HTTP/parse failure: fall through to the retry pause
                break
        if attempt < 2:
            time.sleep(1)
    return None

def standardize_columns(df, league, season):
    """Rename football-data.co.uk columns to snake_case and tag the frame.

    Returns a new DataFrame (the input is not modified) with standardized
    column names, a 'league_code' column (filled from *league* if the CSV
    lacked a 'Div' column) and a 'season' column like '2023/2024'.
    """
    renamed = df.rename(columns={
        'Div': 'league_code',
        'Date': 'date',
        'HomeTeam': 'home_team',
        'AwayTeam': 'away_team',
        'FTHG': 'home_goals',
        'FTAG': 'away_goals',
        'FTR': 'result',
        'HTHG': 'ht_home_goals',
        'HTAG': 'ht_away_goals',
        'HTR': 'ht_result',
        'HS': 'home_shots',
        'AS': 'away_shots',
        'HST': 'home_shots_target',
        'AST': 'away_shots_target',
        'HF': 'home_fouls',
        'AF': 'away_fouls',
        'HC': 'home_corners',
        'AC': 'away_corners',
        'HY': 'home_yellow',
        'AY': 'away_yellow',
        'HR': 'home_red',
        'AR': 'away_red',
        'B365H': 'odds_home_b365',
        'B365D': 'odds_draw_b365',
        'B365A': 'odds_away_b365',
        'BWH': 'odds_home_bw',
        'BWD': 'odds_draw_bw',
        'BWA': 'odds_away_bw',
    })

    # Some CSVs omit the 'Div' column; fall back to the requested league code
    if 'league_code' not in renamed.columns:
        renamed['league_code'] = league

    # Expand the compact 'YYyy' season code into a readable 'YYYY/YYYY' label
    renamed['season'] = f"20{season[:2]}/20{season[2:]}"

    return renamed

def _load_existing(path):
    """Load previously downloaded matches from *path*.

    Returns (DataFrame, set of league codes); an empty frame and empty
    set when the file does not exist yet.
    """
    if not os.path.exists(path):
        return pd.DataFrame(), set()

    existing_df = pd.read_csv(path, low_memory=False)
    print(f"\nExisting data: {len(existing_df):,} matches")

    existing_leagues = set(existing_df['league_code'].unique())
    print(f"Existing leagues: {', '.join(sorted(existing_leagues))}")
    return existing_df, existing_leagues


def _download_leagues(leagues):
    """Download every configured season for each league; return list of DataFrames."""
    frames = []
    for league, info in leagues.items():
        print(f"\n{'='*40}")
        print(f"Downloading: {info['name']} ({info['country']})")
        print('='*40)

        for season in SEASONS:
            url = f"https://www.football-data.co.uk/mmz4281/{season}/{league}.csv"
            print(f"  Season 20{season[:2]}/20{season[2:]}: ", end='', flush=True)

            df = download_csv(url)
            if df is not None and len(df) > 0:
                frames.append(standardize_columns(df, league, season))
                print(f"✓ {len(df)} matches")
            else:
                print("✗ No data")

            time.sleep(0.5)  # Rate limiting: be polite to football-data.co.uk
    return frames


def _merge(existing_df, new_df):
    """Append *new_df* to *existing_df*, keeping only the shared columns.

    Column order follows the existing file so repeated runs produce a
    stable CSV layout (a plain set intersection would shuffle columns
    nondeterministically between runs).
    """
    if len(existing_df) == 0:
        return new_df

    new_cols = set(new_df.columns)
    common_cols = [c for c in existing_df.columns if c in new_cols]
    return pd.concat(
        [existing_df[common_cols], new_df[common_cols]],
        ignore_index=True,
    )


def main():
    """Download the extra leagues and merge them into the historical dataset."""
    print("=" * 60)
    print("BetPredictAI - Additional Leagues Download")
    print("=" * 60)

    existing_file = os.path.join(DATA_DIR, 'advanced_historical_matches.csv')
    existing_df, existing_leagues = _load_existing(existing_file)

    # Only fetch leagues that are not already in the dataset
    leagues_to_download = {k: v for k, v in EXTRA_LEAGUES.items() if k not in existing_leagues}

    if not leagues_to_download:
        print("\nAll extra leagues already downloaded!")
        return

    print(f"\nLeagues to download: {', '.join(leagues_to_download.keys())}")

    all_new_data = _download_leagues(leagues_to_download)
    if not all_new_data:
        print("\nNo new data collected")
        return

    new_df = pd.concat(all_new_data, ignore_index=True)
    print(f"\n✓ New data collected: {len(new_df):,} matches")

    combined_df = _merge(existing_df, new_df)
    combined_df.to_csv(existing_file, index=False)
    print(f"\n✓ Total data saved: {len(combined_df):,} matches")

    # Final stats; value_counts() already returns counts in descending order
    print("\n" + "=" * 60)
    print("Final Statistics")
    print("=" * 60)
    print(f"\nMatches per league:")
    for league, count in combined_df['league_code'].value_counts().items():
        print(f"  {league}: {count:,}")


if __name__ == '__main__':
    main()
