"""
Store Sales Time Series Forecasting - Kaggle Competition
Author: vincenzorubino
Objective: Predict sales for Favorita stores in Ecuador
"""

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_log_error
import warnings
warnings.filterwarnings('ignore')

banner = "=" * 70
print(banner)
print("   Store Sales Time Series Forecasting")
print("   Kaggle Competition - vincenzorubino")
print(banner)

# ============================================================================
# LOAD DATA
# ============================================================================
print("\n[1/6] Loading data...")

_DATA_DIR = '/kaggle/input/store-sales-time-series-forecasting'

def _load_csv(name, date_cols=()):
    """Read one competition CSV, parsing the given date columns."""
    return pd.read_csv(f'{_DATA_DIR}/{name}.csv', parse_dates=list(date_cols))

train = _load_csv('train', date_cols=['date'])
test = _load_csv('test', date_cols=['date'])
stores = _load_csv('stores')
oil = _load_csv('oil', date_cols=['date'])
holidays = _load_csv('holidays_events', date_cols=['date'])
transactions = _load_csv('transactions', date_cols=['date'])

print(f"  Train shape: {train.shape}")
print(f"  Test shape: {test.shape}")
print(f"  Date range: {train['date'].min()} to {train['date'].max()}")

# ============================================================================
# FEATURE ENGINEERING
# ============================================================================
print("\n[2/6] Feature engineering...")

def create_features(df, oil_df, stores_df, holidays_df):
    """Add calendar, store, oil-price and national-holiday features.

    Parameters
    ----------
    df : DataFrame with at least 'date', 'store_nbr' and 'family' columns
        (train or test frame); it is not modified in place.
    oil_df : DataFrame with 'date' and 'dcoilwtico' (daily WTI oil price).
    stores_df : DataFrame with 'store_nbr', 'city', 'state', 'type', 'cluster'.
    holidays_df : DataFrame with 'date', 'locale' and 'transferred' columns.

    Returns
    -------
    A copy of `df` with date parts, weekend/month-boundary flags, store
    metadata, a gap-filled 'oil_price', an 'is_holiday' flag and integer
    encodings of the categorical columns.
    """
    df = df.copy()

    # Calendar components derived from the 'date' column.
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['dayofweek'] = df['date'].dt.dayofweek
    df['dayofyear'] = df['date'].dt.dayofyear
    df['weekofyear'] = df['date'].dt.isocalendar().week.astype(int)
    df['quarter'] = df['date'].dt.quarter

    # dayofweek is 0=Monday .. 6=Sunday, so >= 5 means Saturday/Sunday.
    df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)

    # Month-boundary flags (payday effects are common in retail sales).
    df['is_month_start'] = df['date'].dt.is_month_start.astype(int)
    df['is_month_end'] = df['date'].dt.is_month_end.astype(int)

    # Store metadata: city, state, type, cluster.
    df = df.merge(stores_df, on='store_nbr', how='left')

    # Oil price: fill gaps in the source series, merge, then fill again
    # because dates absent from oil_df (e.g. weekends) produce NaN.
    oil_df = oil_df.rename(columns={'dcoilwtico': 'oil_price'})
    oil_df['oil_price'] = oil_df['oil_price'].ffill().bfill()
    df = df.merge(oil_df, on='date', how='left')
    df['oil_price'] = df['oil_price'].ffill().bfill()

    # Flag national holidays that were actually observed on their date
    # (transferred holidays were moved to another day by the government).
    national_holidays = holidays_df[
        (holidays_df['locale'] == 'National') &
        (holidays_df['transferred'] == False)
    ][['date']].drop_duplicates()
    national_holidays['is_holiday'] = 1
    df = df.merge(national_holidays, on='date', how='left')
    df['is_holiday'] = df['is_holiday'].fillna(0).astype(int)

    # Integer-encode categoricals. NOTE(review): cat.codes are assigned
    # per-frame from the sorted unique values, so train/test encodings only
    # agree because both frames contain the same category sets — verify if
    # the data changes.
    for col in ('family', 'city', 'state', 'type'):
        df[f'{col}_encoded'] = df[col].astype('category').cat.codes

    return df

# Run the shared feature pipeline over both frames.
train, test = (
    create_features(frame, oil, stores, holidays) for frame in (train, test)
)

print(f"  Features created: {len(train.columns)} columns")

# ============================================================================
# LAG FEATURES (only for training data to avoid leakage)
# ============================================================================
print("\n[3/6] Creating lag features...")

# Group by store and family for lag features
def add_lag_features(df, lags=(7, 14, 28), windows=(7, 14, 28)):
    """Add per-(store, family) lag and rolling-mean sales features.

    Parameters
    ----------
    df : DataFrame with 'store_nbr', 'family', 'date' and 'sales' columns.
    lags : iterable of int, optional
        Day offsets for sales_lag_<k> columns (default (7, 14, 28)).
        Previously a mutable list default, which is a Python footgun.
    windows : iterable of int, optional
        Window sizes for sales_rolling_<w> columns (default (7, 14, 28)),
        generalizing the formerly hard-coded 7/14/28 blocks.

    Returns
    -------
    A new frame sorted by (store_nbr, family, date) with the added columns.
    The rolling means are shifted by one day so they only use past sales.
    """
    df = df.sort_values(['store_nbr', 'family', 'date'])
    grouped = df.groupby(['store_nbr', 'family'])['sales']

    for lag in lags:
        df[f'sales_lag_{lag}'] = grouped.shift(lag)

    # shift(1) keeps the current day's sales out of its own rolling mean;
    # w=window binds the loop variable (avoids late-binding closure bug).
    for window in windows:
        df[f'sales_rolling_{window}'] = grouped.transform(
            lambda x, w=window: x.shift(1).rolling(w, min_periods=1).mean()
        )

    return df

# Lag features are computed on the training history only (no leakage).
train = add_lag_features(train)

# The test horizon has no observed sales, so approximate its lag features
# with per-(store, family) summaries of the training history.
group_stats = (
    train.groupby(['store_nbr', 'family'])['sales']
    .agg(['last', 'mean'])
    .reset_index()
    .rename(columns={'last': 'last_sales', 'mean': 'avg_sales'})
)

test = test.merge(group_stats, on=['store_nbr', 'family'], how='left')
for short_lag_col in ('sales_lag_7', 'sales_lag_14'):
    test[short_lag_col] = test['last_sales']
for avg_col in ('sales_lag_28', 'sales_rolling_7',
                'sales_rolling_14', 'sales_rolling_28'):
    test[avg_col] = test['avg_sales']

print(f"  Lag features added")

# ============================================================================
# PREPARE TRAINING DATA
# ============================================================================
print("\n[4/6] Preparing model...")

# Restrict training to 2015 onward: recent history is most relevant and it
# keeps memory usage and training time manageable.
train_recent = train.loc[train['date'] >= '2015-01-01'].copy()

# The longest lag (28 days) leaves NaNs at the start of each series; drop them.
train_recent = train_recent.dropna()

feature_cols = (
    # identifiers / promotion counts
    ['store_nbr', 'onpromotion']
    # calendar features
    + ['year', 'month', 'day', 'dayofweek', 'dayofyear', 'weekofyear', 'quarter']
    + ['is_weekend', 'is_month_start', 'is_month_end']
    # store metadata, oil price, holiday flag
    + ['cluster', 'oil_price', 'is_holiday']
    # encoded categoricals
    + ['family_encoded', 'city_encoded', 'state_encoded', 'type_encoded']
    # lag and rolling-mean features
    + ['sales_lag_7', 'sales_lag_14', 'sales_lag_28']
    + ['sales_rolling_7', 'sales_rolling_14', 'sales_rolling_28']
)

X_train = train_recent[feature_cols]
y_train = train_recent['sales']

# Train on log1p(sales): RMSE in log space corresponds to the competition's
# RMSLE metric.
y_train_log = np.log1p(y_train)

print(f"  Training samples: {len(X_train)}")
print(f"  Features: {len(feature_cols)}")

# ============================================================================
# TRAIN MODEL
# ============================================================================
print("\n[5/6] Training LightGBM model...")

# LightGBM hyperparameters; RMSE on log1p targets ~ RMSLE on raw sales.
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 127,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'n_jobs': -1,
    'seed': 42
}

# Expanding-window CV: each fold validates on data later than its train span.
tscv = TimeSeriesSplit(n_splits=3)
models = []
scores = []

for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train)):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train_log.iloc[train_idx], y_train_log.iloc[val_idx]

    train_data = lgb.Dataset(X_tr, label=y_tr)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    model = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
    )

    # Predictions are in log1p space: invert with expm1 and clip at zero
    # (sales cannot be negative).
    val_pred = np.expm1(model.predict(X_val))
    val_pred = np.maximum(val_pred, 0)
    val_true = np.expm1(y_val)

    # BUG FIX: mean_squared_log_error applies log1p internally, so the
    # previous "val_true + 1, val_pred + 1" double-shifted the values and
    # reported the RMS of log(2 + x) differences instead of the true RMSLE.
    rmsle = np.sqrt(mean_squared_log_error(val_true, val_pred))
    scores.append(rmsle)
    models.append(model)

    print(f"  Fold {fold+1} RMSLE: {rmsle:.5f}")

print(f"\n  Mean CV RMSLE: {np.mean(scores):.5f} (+/- {np.std(scores):.5f})")

# ============================================================================
# MAKE PREDICTIONS
# ============================================================================
print("\n[6/6] Making predictions...")

X_test = test[feature_cols].fillna(0)

# Average the per-fold models; predictions are in log1p space, so invert
# with expm1 before averaging.
fold_preds = [np.expm1(fold_model.predict(X_test)) for fold_model in models]
predictions = np.mean(fold_preds, axis=0)

# Sales cannot be negative.
predictions = np.clip(predictions, 0, None)

# Build and write the submission file.
submission = pd.DataFrame({
    'id': test['id'],
    'sales': predictions
})
submission.to_csv('submission.csv', index=False)

print(f"  Submission saved: {len(submission)} predictions")
print(f"  Sales range: {predictions.min():.2f} - {predictions.max():.2f}")

# Feature importance from the first fold's model (split counts).
print("\n[Feature Importance - Top 10]")
importance = (
    pd.DataFrame({
        'feature': feature_cols,
        'importance': models[0].feature_importance()
    })
    .sort_values('importance', ascending=False)
)
print(importance.head(10).to_string(index=False))

print("\n" + "=" * 70)
print("   DONE! submission.csv ready for upload")
print("=" * 70)
