#!/usr/bin/env python3
"""
BET.CUTTALO.COM - COMPREHENSIVE DATA COLLECTOR
Sistema completo per raccogliere TUTTI i dati calcistici disponibili:
- Top 5 leghe europee + seconde divisioni
- Champions League, Europa League, Conference League
- Coppe nazionali (FA Cup, Coppa Italia, Copa del Rey, DFB Pokal, Coupe de France)
- Leghe secondarie europee
- Dati storici e partite future
"""

import requests
import pandas as pd
import json
import os
from datetime import datetime, timedelta
from typing import List, Dict, Optional
import time
from io import StringIO
import logging

# Configure logging once at import time: INFO level, timestamped records.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by every function and method in this file.
logger = logging.getLogger(__name__)

# Output directory: the "data" directory one level above the directory that
# contains this file. It is created at import time if it does not exist yet.
DATA_DIR = os.path.join(os.path.dirname(__file__), '..', 'data')
os.makedirs(DATA_DIR, exist_ok=True)


class ComprehensiveDataCollector:
    """
    Collects football data from multiple sources:
    1. football-data.co.uk - free historical data (1993-today)
    2. API-Football (if available) - live data and fixtures
    3. OpenLigaDB - free Bundesliga data
    4. TheSportsDB - various free data

    Note: the methods of this class currently query only sources 1 and 4.
    """

    BASE_URL = "https://www.football-data.co.uk"

    # TUTTI i campionati disponibili su football-data.co.uk
    ALL_LEAGUES = {
        # TOP 5 LEAGUES
        'E0': {'name': 'Premier League', 'country': 'England', 'flag': '🏴󠁧󠁢󠁥󠁮󠁧󠁿', 'tier': 1, 'priority': 1},
        'SP1': {'name': 'La Liga', 'country': 'Spain', 'flag': '🇪🇸', 'tier': 1, 'priority': 1},
        'I1': {'name': 'Serie A', 'country': 'Italy', 'flag': '🇮🇹', 'tier': 1, 'priority': 1},
        'D1': {'name': 'Bundesliga', 'country': 'Germany', 'flag': '🇩🇪', 'tier': 1, 'priority': 1},
        'F1': {'name': 'Ligue 1', 'country': 'France', 'flag': '🇫🇷', 'tier': 1, 'priority': 1},

        # LOWER DIVISIONS (tiers 2-4)
        'E1': {'name': 'Championship', 'country': 'England', 'flag': '🏴󠁧󠁢󠁥󠁮󠁧󠁿', 'tier': 2, 'priority': 2},
        'E2': {'name': 'League One', 'country': 'England', 'flag': '🏴󠁧󠁢󠁥󠁮󠁧󠁿', 'tier': 3, 'priority': 3},
        'E3': {'name': 'League Two', 'country': 'England', 'flag': '🏴󠁧󠁢󠁥󠁮󠁧󠁿', 'tier': 4, 'priority': 3},
        'SP2': {'name': 'La Liga 2', 'country': 'Spain', 'flag': '🇪🇸', 'tier': 2, 'priority': 2},
        'I2': {'name': 'Serie B', 'country': 'Italy', 'flag': '🇮🇹', 'tier': 2, 'priority': 2},
        'D2': {'name': '2. Bundesliga', 'country': 'Germany', 'flag': '🇩🇪', 'tier': 2, 'priority': 2},
        'F2': {'name': 'Ligue 2', 'country': 'France', 'flag': '🇫🇷', 'tier': 2, 'priority': 2},

        # OTHER EUROPEAN LEAGUES
        'N1': {'name': 'Eredivisie', 'country': 'Netherlands', 'flag': '🇳🇱', 'tier': 1, 'priority': 2},
        'B1': {'name': 'Jupiler Pro League', 'country': 'Belgium', 'flag': '🇧🇪', 'tier': 1, 'priority': 2},
        'P1': {'name': 'Primeira Liga', 'country': 'Portugal', 'flag': '🇵🇹', 'tier': 1, 'priority': 2},
        'T1': {'name': 'Super Lig', 'country': 'Turkey', 'flag': '🇹🇷', 'tier': 1, 'priority': 2},
        'G1': {'name': 'Super League', 'country': 'Greece', 'flag': '🇬🇷', 'tier': 1, 'priority': 2},
        'SC0': {'name': 'Scottish Premiership', 'country': 'Scotland', 'flag': '🏴󠁧󠁢󠁳󠁣󠁴󠁿', 'tier': 1, 'priority': 2},
        'SC1': {'name': 'Scottish Championship', 'country': 'Scotland', 'flag': '🏴󠁧󠁢󠁳󠁣󠁴󠁿', 'tier': 2, 'priority': 3},

        # CENTRAL & EASTERN EUROPE
        'RUS': {'name': 'Russian Premier League', 'country': 'Russia', 'flag': '🇷🇺', 'tier': 1, 'priority': 3},
        'POL': {'name': 'Ekstraklasa', 'country': 'Poland', 'flag': '🇵🇱', 'tier': 1, 'priority': 3},
        'SWZ': {'name': 'Super League', 'country': 'Switzerland', 'flag': '🇨🇭', 'tier': 1, 'priority': 3},
        'AUT': {'name': 'Bundesliga', 'country': 'Austria', 'flag': '🇦🇹', 'tier': 1, 'priority': 3},

        # SOUTH AMERICA (if available)
        'ARG': {'name': 'Primera Division', 'country': 'Argentina', 'flag': '🇦🇷', 'tier': 1, 'priority': 3},
        'BRA': {'name': 'Serie A', 'country': 'Brazil', 'flag': '🇧🇷', 'tier': 1, 'priority': 3},
    }

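    # European cups (assumed to be published on football-data.co.uk under the code EC).
    # NOTE(review): on football-data.co.uk the EC file appears to be the English
    # Conference / National League, not the UEFA Champions League - verify this
    # mapping before trusting rows labelled as Champions League.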
    EUROPEAN_CUPS = {
        'EC': {'name': 'UEFA Champions League', 'flag': '🏆', 'priority': 1},
    }

    # Stagioni disponibili (dal 2005 per avere dati consistenti)
    SEASONS = [
        '0506', '0607', '0708', '0809', '0910',
        '1011', '1112', '1213', '1314', '1415',
        '1516', '1617', '1718', '1819', '1920',
        '2021', '2122', '2223', '2324', '2425'
    ]

    def __init__(self):
        """Create the shared HTTP session and zero the run counters."""
        # Browser-like User-Agent sent with every request made through the session.
        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        http = requests.Session()
        http.headers.update(browser_headers)
        self.session = http

        # Counters updated while collecting and reported at the end of a run.
        self.stats = dict(downloaded=0, failed=0, total_matches=0)

    def _download_csv(self, league: str, season: str, is_cup: bool = False) -> pd.DataFrame:
        """Download and normalise the CSV for one league code and season.

        Args:
            league: Division code used as the CSV file name (e.g. ``'E0'``).
            season: Four-digit season folder (e.g. ``'2324'``).
            is_cup: Accepted for backward compatibility and ignored; the
                previous implementation built the same URL for cups and
                leagues.

        Returns:
            The standardised DataFrame, or an empty DataFrame when the file
            does not exist (HTTP 404) or cannot be downloaded or parsed.
        """
        # NOTE(review): codes such as 'ARG' or 'BRA' may be published under a
        # different path on the site; with this URL scheme they would simply
        # return 404 and be reported as missing - confirm against the site.
        url = f"{self.BASE_URL}/mmz4281/{season}/{league}.csv"

        try:
            response = self.session.get(url, timeout=30)
            if response.status_code == 404:
                # A missing league/season combination is expected, not an error.
                return pd.DataFrame()
            response.raise_for_status()

            # Decode the raw bytes here instead of relying on response.text:
            # 'utf-8-sig' strips a leading BOM (which would otherwise end up
            # inside the first column name), and latin-1 is a fallback that
            # can decode any byte sequence.
            raw = response.content
            try:
                text = raw.decode('utf-8-sig')
            except UnicodeDecodeError:
                text = raw.decode('latin-1')

            # No `encoding` argument: it has no effect on an in-memory text buffer.
            df = pd.read_csv(StringIO(text), on_bad_lines='skip')
        except (requests.RequestException,
                pd.errors.EmptyDataError,
                pd.errors.ParserError) as e:
            # Logged at WARNING so failures are visible with the INFO-level
            # logging configuration used by this module.
            logger.warning(f"Error downloading {league} {season}: {e}")
            return pd.DataFrame()

        # Drop rows in which every column is empty (e.g. trailing ",,,," lines)
        # so they are not tagged with metadata and counted as matches.
        df = df.dropna(how='all')

        return self._standardize_columns(df, league, season)

    def _standardize_columns(self, df: pd.DataFrame, league: str, season: str) -> pd.DataFrame:
        """Standardizza le colonne del DataFrame"""
        if df.empty:
            return df

        # Mapping colonne comuni
        column_mapping = {
            'Date': 'date',
            'Time': 'time',
            'HomeTeam': 'home_team',
            'AwayTeam': 'away_team',
            'FTHG': 'home_goals',
            'FTAG': 'away_goals',
            'FTR': 'result',
            'HTHG': 'ht_home_goals',
            'HTAG': 'ht_away_goals',
            'HTR': 'ht_result',
            'HS': 'home_shots',
            'AS': 'away_shots',
            'HST': 'home_shots_target',
            'AST': 'away_shots_target',
            'HC': 'home_corners',
            'AC': 'away_corners',
            'HF': 'home_fouls',
            'AF': 'away_fouls',
            'HY': 'home_yellows',
            'AY': 'away_yellows',
            'HR': 'home_reds',
            'AR': 'away_reds',
            # Odds
            'B365H': 'odds_home',
            'B365D': 'odds_draw',
            'B365A': 'odds_away',
            'BWH': 'bw_odds_home',
            'BWD': 'bw_odds_draw',
            'BWA': 'bw_odds_away',
        }

        # Rinomina colonne
        df = df.rename(columns={k: v for k, v in column_mapping.items() if k in df.columns})

        # Aggiungi metadati
        df['league'] = league
        df['season'] = season

        # Aggiungi info lega
        if league in self.ALL_LEAGUES:
            df['league_name'] = self.ALL_LEAGUES[league]['name']
            df['country'] = self.ALL_LEAGUES[league]['country']
            df['flag'] = self.ALL_LEAGUES[league]['flag']
        elif league in self.EUROPEAN_CUPS:
            df['league_name'] = self.EUROPEAN_CUPS[league]['name']
            df['country'] = 'Europe'
            df['flag'] = self.EUROPEAN_CUPS[league]['flag']

        return df

    def collect_all_leagues(self,
                            priority_levels: Optional[List[int]] = None,
                            start_season: str = '1516',
                            include_cups: bool = True,
                            request_delay: float = 0.3) -> pd.DataFrame:
        """Download every selected league/season CSV and concatenate the results.

        Args:
            priority_levels: ``priority`` values from ``ALL_LEAGUES`` to
                include. ``None`` (the default) means ``[1, 2]``.
            start_season: First season code to fetch; later entries of
                ``SEASONS`` are fetched as well. When the code is not in
                ``SEASONS`` a warning is logged and every season is fetched.
            include_cups: Whether to also fetch the entries of ``EUROPEAN_CUPS``.
            request_delay: Seconds to sleep after each request (rate
                limiting). Values <= 0 disable the pause.

        Returns:
            One DataFrame with all downloaded rows, or an empty DataFrame
            when nothing could be downloaded. ``self.stats`` is updated with
            the number of downloaded files, failed files and total rows.
        """
        # None instead of a list literal avoids sharing one mutable default
        # object between calls.
        if priority_levels is None:
            priority_levels = [1, 2]
        wanted_priorities = set(priority_levels)

        # Leagues whose priority was requested.
        leagues_to_fetch = {
            code: info for code, info in self.ALL_LEAGUES.items()
            if info['priority'] in wanted_priorities
        }

        # Cups are merged into the same mapping so one loop handles both.
        if include_cups:
            leagues_to_fetch.update(self.EUROPEAN_CUPS)

        # Seasons from start_season onwards; unknown codes fall back to all
        # seasons, but no longer silently.
        if start_season in self.SEASONS:
            seasons_to_fetch = self.SEASONS[self.SEASONS.index(start_season):]
        else:
            logger.warning(
                f"Stagione iniziale '{start_season}' sconosciuta: "
                f"uso tutte le stagioni disponibili"
            )
            seasons_to_fetch = list(self.SEASONS)

        total_requests = len(leagues_to_fetch) * len(seasons_to_fetch)
        current = 0

        logger.info(f"Raccolta dati da {len(leagues_to_fetch)} campionati, {len(seasons_to_fetch)} stagioni...")
        logger.info(f"Totale richieste: {total_requests}")

        all_data = []
        for league_code, league_info in leagues_to_fetch.items():
            is_cup = league_code in self.EUROPEAN_CUPS

            for season in seasons_to_fetch:
                current += 1

                df = self._download_csv(league_code, season, is_cup)

                if df.empty:
                    # Missing files (404) and download/parse errors both land here.
                    self.stats['failed'] += 1
                else:
                    all_data.append(df)
                    self.stats['downloaded'] += 1
                    self.stats['total_matches'] += len(df)
                    logger.info(f"[{current}/{total_requests}] {league_info['name']} {season}: {len(df)} partite")

                # Rate limiting between consecutive requests.
                if request_delay > 0:
                    time.sleep(request_delay)

        if not all_data:
            return pd.DataFrame()

        combined = pd.concat(all_data, ignore_index=True)
        logger.info(f"\nTotale: {len(combined)} partite da {self.stats['downloaded']} file")
        return combined

    def collect_fixtures_from_api(self) -> Dict:
        """Fetch upcoming fixtures for a fixed set of leagues from TheSportsDB.

        Returns:
            Dict mapping the internal league code to a list of fixture dicts
            with the keys ``id``, ``date``, ``time``, ``home_team``,
            ``away_team``, ``league``, ``league_name``, ``venue`` and
            ``round``. Leagues with no events, or whose request failed, are
            absent from the result.
        """
        fixtures = {}

        # TheSportsDB API base URL (free tier).
        SPORTSDB_URL = "https://www.thesportsdb.com/api/v1/json/3"

        # TheSportsDB league id -> internal code and display name.
        LEAGUE_IDS = {
            '4328': {'code': 'E0', 'name': 'Premier League'},
            '4335': {'code': 'SP1', 'name': 'La Liga'},
            '4332': {'code': 'I1', 'name': 'Serie A'},
            '4331': {'code': 'D1', 'name': 'Bundesliga'},
            '4334': {'code': 'F1', 'name': 'Ligue 1'},
            '4480': {'code': 'EC', 'name': 'Champions League'},
            '4481': {'code': 'EL', 'name': 'Europa League'},
        }

        logger.info("Raccolta fixtures da TheSportsDB...")

        for league_id, info in LEAGUE_IDS.items():
            try:
                # Next scheduled events for the league (15 according to the
                # original comment - not verifiable from this file).
                url = f"{SPORTSDB_URL}/eventsnextleague.php?id={league_id}"
                response = self.session.get(url, timeout=30)

                if response.status_code != 200:
                    # Previously ignored silently.
                    logger.warning(f"  {info['name']}: HTTP {response.status_code}")
                    continue

                payload = response.json()
                # 'events' may be missing or null; a non-dict payload is
                # treated the same way.
                events = (payload.get('events') if isinstance(payload, dict) else None) or []

                for event in events:
                    if not isinstance(event, dict):
                        continue
                    # `or` (not a .get default) so that a key present with a
                    # null value also falls back to the default kick-off time
                    # instead of raising TypeError on the slice.
                    kickoff = (event.get('strTime') or '15:00:00')[:5]
                    fixtures.setdefault(info['code'], []).append({
                        'id': event.get('idEvent'),
                        'date': event.get('dateEvent'),
                        'time': kickoff,
                        'home_team': event.get('strHomeTeam'),
                        'away_team': event.get('strAwayTeam'),
                        'league': info['code'],
                        'league_name': info['name'],
                        'venue': event.get('strVenue'),
                        'round': event.get('intRound'),
                    })

                logger.info(f"  {info['name']}: {len(events)} fixtures")

            except (requests.RequestException, ValueError) as e:
                # Network/HTTP problems and undecodable JSON: skip this
                # league and keep going with the others.
                logger.error(f"Errore {info['name']}: {e}")
            finally:
                # Pause after every request, including failed ones, so the
                # rate limit is respected on the error path as well.
                time.sleep(0.5)

        return fixtures

    def save_data(self, df: pd.DataFrame, filename: str = 'comprehensive_matches.csv'):
        """Write the collected matches to a CSV in ``DATA_DIR`` and log a summary.

        Args:
            df: Matches to persist.
            filename: Name of the CSV file created inside ``DATA_DIR``.
        """
        filepath = os.path.join(DATA_DIR, filename)
        df.to_csv(filepath, index=False)
        logger.info(f"Dati salvati in: {filepath}")
        logger.info(f"Totale righe: {len(df)}")

        # Per-league row counts, largest first (top 10 only).
        if 'league' in df.columns:
            league_stats = df.groupby('league').size().sort_values(ascending=False)
            logger.info("\nPartite per campionato:")
            for league, count in league_stats.head(10).items():
                # Look the code up in both tables so cup rows are shown with
                # their display name as well; unknown codes are shown as-is.
                info = self.ALL_LEAGUES.get(league) or self.EUROPEAN_CUPS.get(league) or {}
                name = info.get('name', league)
                logger.info(f"  {name}: {count}")

    def save_fixtures(self, fixtures: Dict, filename: str = 'all_fixtures.json'):
        """Write the fixtures to a JSON file in ``DATA_DIR``.

        The file holds a generation timestamp, the fixtures mapping itself
        and the total number of fixtures across all leagues.

        Args:
            fixtures: Mapping of league code to a list of fixture dicts.
            filename: Name of the JSON file created inside ``DATA_DIR``.
        """
        filepath = os.path.join(DATA_DIR, filename)

        # Envelope: timestamp first, then the payload and its size.
        generated_at = datetime.now().isoformat()
        total_count = 0
        for league_fixtures in fixtures.values():
            total_count += len(league_fixtures)
        data = {
            'generated_at': generated_at,
            'fixtures': fixtures,
            'total_count': total_count,
        }

        with open(filepath, 'w', encoding='utf-8') as out_file:
            json.dump(data, out_file, ensure_ascii=False, indent=2)

        logger.info(f"Fixtures salvate in: {filepath}")

    def run_full_collection(self):
        """Run the whole pipeline: historical results, fixtures, final summary.

        Returns:
            Tuple ``(historical, fixtures)``: the DataFrame returned by
            ``collect_all_leagues`` and the dict returned by
            ``collect_fixtures_from_api``.
        """
        banner = "=" * 60
        logger.info(banner)
        logger.info("RACCOLTA DATI COMPLETA")
        logger.info(banner)

        # Step 1: historical results for every priority level, from 2010/11 on.
        logger.info("\n[1/3] Raccolta dati storici...")
        historical = self.collect_all_leagues(
            priority_levels=[1, 2, 3],
            start_season='1011',
            include_cups=True,
        )

        # The CSV is written only when something was actually downloaded.
        nothing_downloaded = historical.empty
        if not nothing_downloaded:
            self.save_data(historical, 'comprehensive_matches.csv')

        # Step 2: upcoming fixtures; the JSON file is always written.
        logger.info("\n[2/3] Raccolta fixtures future...")
        fixtures = self.collect_fixtures_from_api()
        self.save_fixtures(fixtures)

        # Step 3: counters accumulated during the run.
        logger.info("\n[3/3] Statistiche finali:")
        logger.info(f"  File scaricati: {self.stats['downloaded']}")
        logger.info(f"  File falliti: {self.stats['failed']}")
        logger.info(f"  Partite totali: {self.stats['total_matches']}")

        return historical, fixtures


def main():
    """Script entry point: build a collector and run the full collection once."""
    ComprehensiveDataCollector().run_full_collection()


# Run the collection only when this file is executed as a script.
if __name__ == '__main__':
    main()
