"""
Store Sales V5 - Based on V1 (CV 0.347) + Improvements
"""

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_log_error
import warnings
warnings.filterwarnings('ignore')

print("="*70)
print("   Store Sales V5 - Improved Baseline")
print("="*70)

# Load data
print("\n[1/6] Loading data...")
train = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/train.csv', parse_dates=['date'])
test = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/test.csv', parse_dates=['date'])
stores = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/stores.csv')
oil = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/oil.csv', parse_dates=['date'])
holidays = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv', parse_dates=['date'])

print(f"  Train: {train.shape}, Test: {test.shape}")

# Feature Engineering
print("\n[2/6] Feature engineering...")

def create_features(df, oil_df, stores_df, holidays_df):
    df = df.copy()

    # Date features
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['dayofweek'] = df['date'].dt.dayofweek
    df['dayofyear'] = df['date'].dt.dayofyear
    df['weekofyear'] = df['date'].dt.isocalendar().week.astype(int)
    df['quarter'] = df['date'].dt.quarter
    df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)
    df['is_month_start'] = df['date'].dt.is_month_start.astype(int)
    df['is_month_end'] = df['date'].dt.is_month_end.astype(int)

    # Store info
    df = df.merge(stores_df, on='store_nbr', how='left')

    # Oil prices: forward/backward-fill gaps (oil.csv has no weekend rows)
    oil_df = oil_df.rename(columns={'dcoilwtico': 'oil_price'})
    oil_df['oil_price'] = oil_df['oil_price'].ffill().bfill()
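    # (a plausible alternative, not used here: time-weighted interpolation)
    # oil_df['oil_price'] = oil_df.set_index('date')['oil_price'].interpolate(method='time').values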
    df = df.merge(oil_df, on='date', how='left')
    df['oil_price'] = df['oil_price'].ffill().bfill()  # second pass relies on date-sorted rows

    # National holidays
    nat_holidays = holidays_df[(holidays_df['locale'] == 'National') &
                               (holidays_df['transferred'] == False)][['date']].drop_duplicates()
    nat_holidays['is_holiday'] = 1
    df = df.merge(nat_holidays, on='date', how='left')
    df['is_holiday'] = df['is_holiday'].fillna(0).astype(int)
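    # (note: this filter keeps type == 'Work Day' rows, which are compensatory
    # working days rather than holidays; dropping them is a possible refinement)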

    # Encode categoricals via category codes; codes follow the sorted order of the
    # values present, so train/test agree only because both frames contain the
    # same family / city / state / type sets
    df['family_encoded'] = df['family'].astype('category').cat.codes
    df['city_encoded'] = df['city'].astype('category').cat.codes
    df['state_encoded'] = df['state'].astype('category').cat.codes
    df['type_encoded'] = df['type'].astype('category').cat.codes

    return df

train = create_features(train, oil, stores, holidays)
test = create_features(test, oil, stores, holidays)
print(f"  Features: {len(train.columns)} columns")

# Lag Features
print("\n[3/6] Creating lag features...")

def add_lag_features(df, lags=(7, 14, 28)):
    df = df.sort_values(['store_nbr', 'family', 'date'])
    for lag in lags:
        df[f'sales_lag_{lag}'] = df.groupby(['store_nbr', 'family'])['sales'].shift(lag)

    # Rolling means of past sales; shift(1) excludes the current day so the
    # feature cannot leak the target
    for window in [7, 14, 28]:
        df[f'sales_rolling_{window}'] = df.groupby(['store_nbr', 'family'])['sales'].transform(
            lambda x: x.shift(1).rolling(window, min_periods=1).mean()
        )
    return df

train = add_lag_features(train)

# For test: the 7/14-day lags mostly fall inside the 16-day test window itself,
# so use constant per-series proxies: the last observed value and the mean.
# 'last' picks the most recent value because train is still sorted by
# store/family/date from the lag step.
last_sales = train.groupby(['store_nbr', 'family']).agg({
    'sales': ['last', 'mean']
}).reset_index()
last_sales.columns = ['store_nbr', 'family', 'last_sales', 'avg_sales']

test = test.merge(last_sales, on=['store_nbr', 'family'], how='left')
test['sales_lag_7'] = test['last_sales']
test['sales_lag_14'] = test['last_sales']
test['sales_lag_28'] = test['avg_sales']
test['sales_rolling_7'] = test['avg_sales']
test['sales_rolling_14'] = test['avg_sales']
test['sales_rolling_28'] = test['avg_sales']
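
# Hedged alternative (not wired in): the 28-day lag is fully observable for the
# 16-day test horizon (train runs through 2017-08-15), so it could be merged
# from real history instead of using a constant:
# hist = train[['store_nbr', 'family', 'date', 'sales']].copy()
# hist['date'] = hist['date'] + pd.Timedelta(days=28)
# hist = hist.rename(columns={'sales': 'sales_lag_28_true'})
# test = test.merge(hist, on=['store_nbr', 'family', 'date'], how='left')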

print(f"  Lag features added")

# Prepare Data
print("\n[4/6] Preparing model...")

# Use recent data only (from 2015) and re-sort by date: TimeSeriesSplit assumes
# chronological row order, but train is still sorted by store/family/date
train_recent = train[train['date'] >= '2015-01-01'].dropna().sort_values('date')

feature_cols = [
    'store_nbr', 'onpromotion',
    'year', 'month', 'day', 'dayofweek', 'dayofyear', 'weekofyear', 'quarter',
    'is_weekend', 'is_month_start', 'is_month_end',
    'cluster', 'oil_price', 'is_holiday',
    'family_encoded', 'city_encoded', 'state_encoded', 'type_encoded',
    'sales_lag_7', 'sales_lag_14', 'sales_lag_28',
    'sales_rolling_7', 'sales_rolling_14', 'sales_rolling_28'
]
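
# (store_nbr, cluster and the *_encoded columns enter the model as plain
# integers; passing them via lgb.Dataset(..., categorical_feature=[...]) is a
# plausible alternative)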

X_train = train_recent[feature_cols]
y_train_log = np.log1p(train_recent['sales'])
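# (RMSE on the log1p scale equals RMSLE on raw sales, so the 'rmse' metric
# below optimises the competition metric directly)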

print(f"  Training samples: {len(X_train)}, Features: {len(feature_cols)}")

# Train Model
print("\n[5/6] Training LightGBM...")

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 127,
    'learning_rate': 0.03,  # Lower LR
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'min_child_samples': 20,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'verbose': -1,
    'n_jobs': -1,
    'seed': 42
}

# Expanding-window CV: each fold trains on a chronological prefix and validates
# on the block immediately following it
tscv = TimeSeriesSplit(n_splits=3)
models = []
scores = []

for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train)):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train_log.iloc[train_idx], y_train_log.iloc[val_idx]

    train_data = lgb.Dataset(X_tr, label=y_tr)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    model = lgb.train(
        params, train_data,
        num_boost_round=2000,  # More rounds
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(100)]
    )

    # predict() defaults to the early-stopped best_iteration, so no explicit
    # num_iteration argument is needed
    val_pred = np.expm1(model.predict(X_val))
    val_pred = np.maximum(val_pred, 0)
    val_true = np.expm1(y_val)

    # mean_squared_log_error applies log1p itself, so pass raw values (the
    # earlier +1 shift double-counted the offset)
    rmsle = np.sqrt(mean_squared_log_error(val_true, val_pred))
    scores.append(rmsle)
    models.append(model)
    print(f"  Fold {fold+1} RMSLE: {rmsle:.5f}")

print(f"\n  Mean CV: {np.mean(scores):.5f} (+/- {np.std(scores):.5f})")

# Predictions
print("\n[6/6] Making predictions...")

X_test = test[feature_cols].fillna(0)  # guard NaNs left by the merges (e.g. unseen store/family pairs)
predictions = np.zeros(len(X_test))
for model in models:
    predictions += np.expm1(model.predict(X_test)) / len(models)
predictions = np.maximum(predictions, 0)
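
# (Hedged alternative: average the log-scale predictions and apply expm1 once,
# a geometric-mean-style blend that can differ slightly:)
# pred_log = np.mean([m.predict(X_test) for m in models], axis=0)
# predictions = np.maximum(np.expm1(pred_log), 0)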

# Submission
submission = pd.DataFrame({'id': test['id'], 'sales': predictions})
submission.to_csv('submission.csv', index=False)
print(f"  Submission: {len(submission)} rows")
print(f"  Sales range: {predictions.min():.2f} - {predictions.max():.2f}")

# Feature importance (from the first fold's model; later expanding-window folds
# see more data and may rank features differently)
print("\n[Top 10 Features]")
imp = pd.DataFrame({'feature': feature_cols, 'importance': models[0].feature_importance()})
print(imp.nlargest(10, 'importance').to_string(index=False))

print("\n" + "="*70)
print("   DONE!")
print("="*70)
