#!/usr/bin/env python3
"""
Advanced Data Collector for BetPredictAI

Collects comprehensive football data including:
- Historical match results (extended)
- League standings/tables
- Team statistics
- Head-to-head records
- Goal patterns
- Home/Away performance

Data sources:
- football-data.co.uk (primary, free CSVs)
- OpenLigaDB (Bundesliga fixtures, free API)
"""

import os
import json
import pandas as pd
import numpy as np
import requests
from datetime import datetime, timedelta
from typing import Dict, List, Tuple, Optional
import time

# Configuration
# BASE_DIR is the directory that contains this file. DATA_DIR resolves to the
# `data` directory one level above it; the collector creates it on start-up
# and writes its CSV/JSON outputs there.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, '..', 'data')

# Football-data.co.uk league codes mapped to display metadata (name, country, flag emoji)
FOOTBALL_DATA_LEAGUES = {
    'E0': {'name': 'Premier League', 'country': 'England', 'flag': '🏴󠁧󠁢󠁥󠁮󠁧󠁿'},
    'E1': {'name': 'Championship', 'country': 'England', 'flag': '🏴󠁧󠁢󠁥󠁮󠁧󠁿'},
    'E2': {'name': 'League One', 'country': 'England', 'flag': '🏴󠁧󠁢󠁥󠁮󠁧󠁿'},
    'E3': {'name': 'League Two', 'country': 'England', 'flag': '🏴󠁧󠁢󠁥󠁮󠁧󠁿'},
    'D1': {'name': 'Bundesliga', 'country': 'Germany', 'flag': '🇩🇪'},
    'D2': {'name': '2. Bundesliga', 'country': 'Germany', 'flag': '🇩🇪'},
    'I1': {'name': 'Serie A', 'country': 'Italy', 'flag': '🇮🇹'},
    'I2': {'name': 'Serie B', 'country': 'Italy', 'flag': '🇮🇹'},
    'SP1': {'name': 'La Liga', 'country': 'Spain', 'flag': '🇪🇸'},
    'SP2': {'name': 'La Liga 2', 'country': 'Spain', 'flag': '🇪🇸'},
    'F1': {'name': 'Ligue 1', 'country': 'France', 'flag': '🇫🇷'},
    'F2': {'name': 'Ligue 2', 'country': 'France', 'flag': '🇫🇷'},
    'N1': {'name': 'Eredivisie', 'country': 'Netherlands', 'flag': '🇳🇱'},
    'B1': {'name': 'Pro League', 'country': 'Belgium', 'flag': '🇧🇪'},
    'P1': {'name': 'Primeira Liga', 'country': 'Portugal', 'flag': '🇵🇹'},
    'T1': {'name': 'Super Lig', 'country': 'Turkey', 'flag': '🇹🇷'},
    'G1': {'name': 'Super League', 'country': 'Greece', 'flag': '🇬🇷'},
    'SC0': {'name': 'Premiership', 'country': 'Scotland', 'flag': '🏴󠁧󠁢󠁳󠁣󠁴󠁿'},
}

# Seasons to collect (extended range).
# Each code is the two-digit start year followed by the two-digit end year
# (e.g. '1516' is labelled 2015/2016) and is inserted verbatim into the
# download URL.
SEASONS = ['1516', '1617', '1718', '1819', '1920', '2021', '2122', '2223', '2324', '2425']

# Core leagues for main training; also the default when no leagues are passed
# to collect_historical_matches().
CORE_LEAGUES = ['E0', 'D1', 'I1', 'SP1', 'F1']

# Extended leagues for more data; appended to CORE_LEAGUES when
# run_full_collection() is called with extended=True.
EXTENDED_LEAGUES = ['E1', 'E2', 'N1', 'B1', 'P1', 'SC0']


class AdvancedDataCollector:
    """Collects and processes comprehensive football data"""

    def __init__(self):
        """Set up empty result containers and make sure the data directory exists."""
        # Containers reserved on the instance; no method of this class
        # currently reads or fills them.
        self.all_matches: List = []
        self.standings: Dict = {}
        self.team_stats: Dict = {}

        # Output files are written under DATA_DIR, so create it up front.
        os.makedirs(DATA_DIR, exist_ok=True)

    def download_csv(self, url: str, retries: int = 3) -> Optional[pd.DataFrame]:
        """Download CSV with retries and error handling"""
        for attempt in range(retries):
            try:
                df = pd.read_csv(url, encoding='utf-8', on_bad_lines='skip')
                return df
            except Exception as e:
                if attempt < retries - 1:
                    time.sleep(1)
                    continue
                print(f"  Error downloading {url}: {e}")
                return None

    def collect_historical_matches(self, leagues: List[str] = None, seasons: List[str] = None) -> pd.DataFrame:
        """Download every requested league/season CSV and stack the results.

        Args:
            leagues: League codes to fetch; falls back to ``CORE_LEAGUES``
                when ``None``.
            seasons: Season codes to fetch; falls back to ``SEASONS`` when
                ``None``.

        Returns:
            One DataFrame holding all standardized matches, or an empty
            DataFrame when nothing could be downloaded.
        """
        league_codes = CORE_LEAGUES if leagues is None else leagues
        season_codes = SEASONS if seasons is None else seasons

        frames = []

        for code in league_codes:
            meta = FOOTBALL_DATA_LEAGUES.get(code, {})
            print(f"\n📊 Collecting {meta.get('name', code)} ({meta.get('country', 'Unknown')})")

            for season_code in season_codes:
                csv_url = f"https://www.football-data.co.uk/mmz4281/{season_code}/{code}.csv"
                print(f"  Season 20{season_code[:2]}/20{season_code[2:]}: ", end='', flush=True)

                raw = self.download_csv(csv_url)
                if raw is None or len(raw) == 0:
                    print("✗ No data")
                else:
                    # Bring the raw file onto the shared column schema
                    tidy = self._standardize_columns(raw, code, season_code)
                    frames.append(tidy)
                    print(f"✓ {len(tidy)} matches")

                # Pause between requests to go easy on the server
                time.sleep(0.3)

        if not frames:
            return pd.DataFrame()

        combined = pd.concat(frames, ignore_index=True)
        print(f"\n✓ Total: {len(combined)} matches collected")
        return combined

    def _standardize_columns(self, df: pd.DataFrame, league: str, season: str) -> pd.DataFrame:
        """Rename raw CSV columns to the internal schema and attach metadata.

        Rows that have no home or away team name (e.g. comma-only placeholder
        lines in a CSV) are dropped, because they are not matches and would
        otherwise show up as phantom fixtures downstream. The input frame is
        not modified; a new frame is returned.

        Args:
            df: Raw match data as downloaded.
            league: League code; overwrites any ``Div`` value from the file.
            season: Four-character season code such as ``'2324'``.

        Returns:
            A DataFrame with renamed columns plus ``league_code``, ``season``,
            ``league_name``, ``country``, ``flag`` and a parsed ``date``
            column (all ``NaT`` when the file has no ``Date`` column).
        """
        # Essential columns mapping
        col_mapping = {
            'Div': 'league_code',
            'Date': 'date',
            'HomeTeam': 'home_team',
            'AwayTeam': 'away_team',
            'FTHG': 'home_goals',
            'FTAG': 'away_goals',
            'FTR': 'result',
            'HTHG': 'ht_home_goals',
            'HTAG': 'ht_away_goals',
            'HTR': 'ht_result',
            'HS': 'home_shots',
            'AS': 'away_shots',
            'HST': 'home_shots_target',
            'AST': 'away_shots_target',
            'HF': 'home_fouls',
            'AF': 'away_fouls',
            'HC': 'home_corners',
            'AC': 'away_corners',
            'HY': 'home_yellow',
            'AY': 'away_yellow',
            'HR': 'home_red',
            'AR': 'away_red',
            # Betting odds
            'B365H': 'odds_home_b365',
            'B365D': 'odds_draw_b365',
            'B365A': 'odds_away_b365',
            'BWH': 'odds_home_bw',
            'BWD': 'odds_draw_bw',
            'BWA': 'odds_away_bw',
            'PSH': 'odds_home_ps',
            'PSD': 'odds_draw_ps',
            'PSA': 'odds_away_ps',
        }

        # Rename only the columns that exist; rename() returns a new frame,
        # so the caller's DataFrame is left untouched.
        rename_dict = {k: v for k, v in col_mapping.items() if k in df.columns}
        df = df.rename(columns=rename_dict)

        # A row without both team names cannot be a match.
        team_cols = [c for c in ('home_team', 'away_team') if c in df.columns]
        if team_cols:
            df = df.dropna(subset=team_cols).reset_index(drop=True)

        # Add metadata
        df['league_code'] = league
        df['season'] = f"20{season[:2]}/20{season[2:]}"

        league_info = FOOTBALL_DATA_LEAGUES.get(league, {})
        df['league_name'] = league_info.get('name', league)
        df['country'] = league_info.get('country', 'Unknown')
        df['flag'] = league_info.get('flag', '')

        # Parse date (day-first); unparseable values become NaT. A file
        # without any date column still gets a `date` column so later
        # date-based steps do not fail with KeyError.
        if 'date' in df.columns:
            df['date'] = pd.to_datetime(df['date'], dayfirst=True, errors='coerce')
        else:
            df['date'] = pd.NaT

        return df

    def calculate_standings(self, df: pd.DataFrame) -> Dict:
        """Calculate league standings from match data"""
        standings = {}

        for league in df['league_code'].unique():
            league_df = df[df['league_code'] == league].copy()

            for season in league_df['season'].unique():
                season_df = league_df[league_df['season'] == season]

                teams = {}
                for _, match in season_df.iterrows():
                    home = match['home_team']
                    away = match['away_team']
                    hg = match.get('home_goals', 0) or 0
                    ag = match.get('away_goals', 0) or 0

                    for team in [home, away]:
                        if team not in teams:
                            teams[team] = {
                                'played': 0, 'won': 0, 'drawn': 0, 'lost': 0,
                                'goals_for': 0, 'goals_against': 0, 'points': 0
                            }

                    # Update home team
                    teams[home]['played'] += 1
                    teams[home]['goals_for'] += hg
                    teams[home]['goals_against'] += ag

                    # Update away team
                    teams[away]['played'] += 1
                    teams[away]['goals_for'] += ag
                    teams[away]['goals_against'] += hg

                    if hg > ag:
                        teams[home]['won'] += 1
                        teams[home]['points'] += 3
                        teams[away]['lost'] += 1
                    elif hg < ag:
                        teams[away]['won'] += 1
                        teams[away]['points'] += 3
                        teams[home]['lost'] += 1
                    else:
                        teams[home]['drawn'] += 1
                        teams[away]['drawn'] += 1
                        teams[home]['points'] += 1
                        teams[away]['points'] += 1

                # Calculate goal difference and sort
                for team in teams:
                    teams[team]['goal_diff'] = teams[team]['goals_for'] - teams[team]['goals_against']

                sorted_teams = sorted(teams.items(),
                    key=lambda x: (x[1]['points'], x[1]['goal_diff'], x[1]['goals_for']),
                    reverse=True)

                standings[f"{league}_{season}"] = {
                    'league': league,
                    'season': season,
                    'table': [{'position': i+1, 'team': t[0], **t[1]} for i, t in enumerate(sorted_teams)]
                }

        return standings

    def calculate_team_stats(self, df: pd.DataFrame, last_n_matches: int = 5) -> pd.DataFrame:
        """Calculate rolling team statistics for each match"""
        stats_columns = []

        # Sort by date
        df = df.sort_values('date').reset_index(drop=True)

        # Group by team
        team_history = {}

        for idx, row in df.iterrows():
            home = row['home_team']
            away = row['away_team']
            date = row['date']
            league = row['league_code']

            # Initialize team histories
            for team in [home, away]:
                key = f"{league}_{team}"
                if key not in team_history:
                    team_history[key] = {
                        'matches': [],
                        'home_matches': [],
                        'away_matches': [],
                        'goals_scored': [],
                        'goals_conceded': [],
                        'results': []
                    }

            home_key = f"{league}_{home}"
            away_key = f"{league}_{away}"

            # Calculate stats from last N matches
            home_stats = self._calc_rolling_stats(team_history[home_key], last_n_matches)
            away_stats = self._calc_rolling_stats(team_history[away_key], last_n_matches)

            # Calculate H2H stats
            h2h_stats = self._calc_h2h_stats(df, home, away, date, last_n=5)

            # Store stats
            stats_columns.append({
                'idx': idx,
                **{f'home_{k}': v for k, v in home_stats.items()},
                **{f'away_{k}': v for k, v in away_stats.items()},
                **h2h_stats
            })

            # Update histories after calculating stats (to avoid data leakage)
            hg = row.get('home_goals', 0) or 0
            ag = row.get('away_goals', 0) or 0

            # Home team update
            team_history[home_key]['matches'].append({
                'date': date, 'goals_for': hg, 'goals_against': ag,
                'is_home': True, 'opponent': away,
                'result': 'W' if hg > ag else ('D' if hg == ag else 'L')
            })
            team_history[home_key]['home_matches'].append(team_history[home_key]['matches'][-1])

            # Away team update
            team_history[away_key]['matches'].append({
                'date': date, 'goals_for': ag, 'goals_against': hg,
                'is_home': False, 'opponent': home,
                'result': 'W' if ag > hg else ('D' if ag == hg else 'L')
            })
            team_history[away_key]['away_matches'].append(team_history[away_key]['matches'][-1])

        # Create stats DataFrame and merge
        stats_df = pd.DataFrame(stats_columns)
        df = df.reset_index(drop=True)

        for col in stats_df.columns:
            if col != 'idx':
                df[col] = stats_df[col]

        return df

    def _calc_rolling_stats(self, history: Dict, n: int) -> Dict:
        """Summarise a team's most recent matches.

        Args:
            history: Dict with ``matches``, ``home_matches`` and
                ``away_matches`` lists. Each entry carries ``result``
                ('W'/'D'/'L'), ``goals_for`` and ``goals_against``.
            n: Window size; the last ``n`` entries of each list are used.

        Returns:
            Dict of form metrics (points/goals per game, win rate, win/draw/
            loss counts, current streak, clean sheets, failed-to-score count).
            Every metric is 0 when the team has no recorded matches.
        """
        points_for = {'W': 3, 'D': 1}

        def per_game_points(games):
            # Average points over the given games, 0 when there are none
            if not games:
                return 0
            return sum(points_for.get(g['result'], 0) for g in games) / len(games)

        def per_game_goals(games):
            # Average goals scored over the given games, 0 when there are none
            if not games:
                return 0
            return sum(g['goals_for'] for g in games) / len(games)

        recent = (history['matches'] or [])[-n:]
        recent_home = (history['home_matches'] or [])[-n:]
        recent_away = (history['away_matches'] or [])[-n:]

        if not recent:
            metric_names = (
                'form_ppg', 'form_gpg', 'form_gapg', 'form_win_rate',
                'home_ppg', 'home_gpg', 'away_ppg', 'away_gpg',
                'form_wins', 'form_draws', 'form_losses',
                'streak', 'clean_sheets', 'failed_to_score',
            )
            return {name: 0 for name in metric_names}

        # Single pass over the window for the overall form numbers
        tally = {'W': 0, 'D': 0, 'L': 0}
        scored = 0
        conceded = 0
        shutouts = 0
        blanks = 0
        for game in recent:
            if game['result'] in tally:
                tally[game['result']] += 1
            scored += game['goals_for']
            conceded += game['goals_against']
            if game['goals_against'] == 0:
                shutouts += 1
            if game['goals_for'] == 0:
                blanks += 1

        # Streak: length of the trailing run of identical results, positive
        # for wins and negative for losses; a most-recent draw means 0.
        streak = 0
        latest = recent[-1]['result']
        if latest in ('W', 'L'):
            run = 0
            for game in reversed(recent):
                if game['result'] != latest:
                    break
                run += 1
            streak = run if latest == 'W' else -run

        played = len(recent)
        return {
            'form_ppg': (tally['W'] * 3 + tally['D']) / played,
            'form_gpg': scored / played,
            'form_gapg': conceded / played,
            'form_win_rate': tally['W'] / played,
            'home_ppg': per_game_points(recent_home),
            'home_gpg': per_game_goals(recent_home),
            'away_ppg': per_game_points(recent_away),
            'away_gpg': per_game_goals(recent_away),
            'form_wins': tally['W'],
            'form_draws': tally['D'],
            'form_losses': tally['L'],
            'streak': streak,
            'clean_sheets': shutouts,
            'failed_to_score': blanks
        }

    def _calc_h2h_stats(self, df: pd.DataFrame, home: str, away: str, before_date, last_n: int = 5) -> Dict:
        """Calculate head-to-head statistics"""
        h2h_matches = df[
            (((df['home_team'] == home) & (df['away_team'] == away)) |
             ((df['home_team'] == away) & (df['away_team'] == home))) &
            (df['date'] < before_date)
        ].tail(last_n)

        if len(h2h_matches) == 0:
            return {
                'h2h_matches': 0, 'h2h_home_wins': 0, 'h2h_draws': 0, 'h2h_away_wins': 0,
                'h2h_home_goals': 0, 'h2h_away_goals': 0, 'h2h_total_goals': 0
            }

        home_wins = 0
        away_wins = 0
        draws = 0
        home_goals = 0
        away_goals = 0

        for _, match in h2h_matches.iterrows():
            hg = match.get('home_goals', 0) or 0
            ag = match.get('away_goals', 0) or 0

            if match['home_team'] == home:
                home_goals += hg
                away_goals += ag
                if hg > ag:
                    home_wins += 1
                elif hg < ag:
                    away_wins += 1
                else:
                    draws += 1
            else:
                home_goals += ag
                away_goals += hg
                if ag > hg:
                    home_wins += 1
                elif ag < hg:
                    away_wins += 1
                else:
                    draws += 1

        n = len(h2h_matches)
        return {
            'h2h_matches': n,
            'h2h_home_wins': home_wins / n if n > 0 else 0,
            'h2h_draws': draws / n if n > 0 else 0,
            'h2h_away_wins': away_wins / n if n > 0 else 0,
            'h2h_home_goals': home_goals / n if n > 0 else 0,
            'h2h_away_goals': away_goals / n if n > 0 else 0,
            'h2h_total_goals': (home_goals + away_goals) / n if n > 0 else 0
        }

    def calculate_advanced_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Calculate advanced features for ML model"""

        # Implied probabilities from betting odds
        for bookmaker in ['b365', 'bw', 'ps']:
            h_col = f'odds_home_{bookmaker}'
            d_col = f'odds_draw_{bookmaker}'
            a_col = f'odds_away_{bookmaker}'

            if all(col in df.columns for col in [h_col, d_col, a_col]):
                df[f'impl_home_{bookmaker}'] = 1 / df[h_col].replace(0, np.nan)
                df[f'impl_draw_{bookmaker}'] = 1 / df[d_col].replace(0, np.nan)
                df[f'impl_away_{bookmaker}'] = 1 / df[a_col].replace(0, np.nan)

                # Normalize to probabilities
                total = df[f'impl_home_{bookmaker}'] + df[f'impl_draw_{bookmaker}'] + df[f'impl_away_{bookmaker}']
                df[f'impl_home_{bookmaker}'] = df[f'impl_home_{bookmaker}'] / total
                df[f'impl_draw_{bookmaker}'] = df[f'impl_draw_{bookmaker}'] / total
                df[f'impl_away_{bookmaker}'] = df[f'impl_away_{bookmaker}'] / total

        # Average implied probabilities
        impl_home_cols = [c for c in df.columns if c.startswith('impl_home_')]
        impl_draw_cols = [c for c in df.columns if c.startswith('impl_draw_')]
        impl_away_cols = [c for c in df.columns if c.startswith('impl_away_')]

        if impl_home_cols:
            df['impl_home'] = df[impl_home_cols].mean(axis=1)
            df['impl_draw'] = df[impl_draw_cols].mean(axis=1)
            df['impl_away'] = df[impl_away_cols].mean(axis=1)

        # Form differentials
        if 'home_form_ppg' in df.columns and 'away_form_ppg' in df.columns:
            df['ppg_diff'] = df['home_form_ppg'] - df['away_form_ppg']
            df['attack_diff'] = df['home_form_gpg'] - df['away_form_gpg']
            df['defense_diff'] = df['away_form_gapg'] - df['home_form_gapg']
            df['win_rate_diff'] = df['home_form_win_rate'] - df['away_form_win_rate']

            # Home/Away specific differentials
            df['home_home_vs_away_away'] = df['home_home_ppg'] - df['away_away_ppg']

            # Streak comparison
            df['streak_diff'] = df['home_streak'] - df['away_streak']

        # Goal patterns
        if 'home_goals' in df.columns:
            df['total_goals'] = df['home_goals'].fillna(0) + df['away_goals'].fillna(0)
            df['goal_diff'] = df['home_goals'].fillna(0) - df['away_goals'].fillna(0)

        # Result encoding
        df['result_code'] = df['result'].map({'H': 2, 'D': 1, 'A': 0})

        return df

    def save_data(self, df: pd.DataFrame, filename: str):
        """Write ``df`` (without its index) as a CSV file inside the data directory."""
        target = os.path.join(DATA_DIR, filename)
        df.to_csv(target, index=False)

        record_count = len(df)
        print(f"✓ Saved {record_count} records to {filename}")

    def save_standings(self, standings: Dict, filename: str):
        """Write the standings dict as indented JSON inside the data directory.

        Values JSON cannot encode natively are written via ``str()``.
        """
        target = os.path.join(DATA_DIR, filename)
        with open(target, 'w') as handle:
            json.dump(standings, handle, indent=2, default=str)

        league_season_count = len(standings)
        print(f"✓ Saved standings for {league_season_count} league-seasons to {filename}")

    def run_full_collection(self, extended: bool = False):
        """Run the whole pipeline: download, featurise, rank and save.

        Args:
            extended: When true, ``EXTENDED_LEAGUES`` are collected in
                addition to ``CORE_LEAGUES``.

        Returns:
            The final feature DataFrame, or ``None`` when no match data could
            be collected.
        """
        heavy_rule = "=" * 60
        light_rule = "-" * 40

        def announce(heading):
            # Phase heading followed by the thin separator line
            print(heading)
            print(light_rule)

        print(heavy_rule)
        print("🚀 BetPredictAI - Advanced Data Collection")
        print(heavy_rule)

        leagues = list(CORE_LEAGUES)
        if extended:
            leagues += EXTENDED_LEAGUES

        # 1. Collect historical matches
        announce("\n📥 Phase 1: Collecting Historical Matches")
        df = self.collect_historical_matches(leagues, SEASONS)

        if len(df) == 0:
            print("❌ No data collected!")
            return

        # 2. Calculate team statistics
        announce("\n📊 Phase 2: Calculating Team Statistics")
        df = self.calculate_team_stats(df, last_n_matches=5)
        print(f"✓ Calculated rolling stats for {len(df)} matches")

        # 3. Calculate advanced features
        announce("\n🧮 Phase 3: Calculating Advanced Features")
        df = self.calculate_advanced_features(df)
        print(f"✓ Added advanced features ({len(df.columns)} total columns)")

        # 4. Calculate standings
        announce("\n🏆 Phase 4: Calculating Standings")
        standings = self.calculate_standings(df)
        print(f"✓ Calculated standings for {len(standings)} league-seasons")

        # 5. Save everything
        announce("\n💾 Phase 5: Saving Data")
        self.save_data(df, 'advanced_historical_matches.csv')
        self.save_standings(standings, 'standings.json')

        # Summary
        home_names = set(df['home_team'].unique())
        away_names = set(df['away_team'].unique())

        print("\n" + heavy_rule)
        print("✅ DATA COLLECTION COMPLETE")
        print(heavy_rule)
        print(f"  Total matches: {len(df):,}")
        print(f"  Leagues: {len(df['league_code'].unique())}")
        print(f"  Seasons: {len(df['season'].unique())}")
        print(f"  Teams: {len(home_names | away_names)}")
        print(f"  Features: {len(df.columns)}")

        # Feature summary: counts columns by name prefix only
        feature_prefixes = ('home_', 'away_', 'h2h_', 'impl_', 'ppg_', 'attack_', 'defense_')
        feature_cols = [column for column in df.columns if column.startswith(feature_prefixes)]
        print(f"\n  ML Features available: {len(feature_cols)}")

        return df


def main():
    """Run the full collection with extended leagues and print a feature sample."""
    # Run with extended leagues for maximum data
    result = AdvancedDataCollector().run_full_collection(extended=True)
    if result is None:
        return

    # Show sample of features
    print("\n📋 Sample Features:")
    wanted = ('home_form_ppg', 'away_form_ppg', 'h2h_home_wins', 'impl_home', 'ppg_diff')
    present = [name for name in wanted if name in result.columns]
    if present:
        print(result[present].describe())


if __name__ == '__main__':
    main()
