"""
Store Sales Time Series Forecasting V3 - Multi-Lag Ensemble
============================================================
Competition: https://www.kaggle.com/competitions/store-sales-time-series-forecasting

Key techniques from top notebooks:
1. Multi-lag features: top notebooks use 7/63/365/730-day lags; here sized to the 16-day test horizon
2. Zero forecasting with 21-day window
3. Advanced holiday features
4. LightGBM with Tweedie loss
5. Store/family-level features
"""

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
import warnings
warnings.filterwarnings('ignore')

print("="*70)
print("   Store Sales V3 - Multi-Lag Ensemble")
print("   Kaggle Competition - vincenzorubino")
print("="*70)
print()

# ==============================================================================
# 1. LOAD DATA
# ==============================================================================
print("[1/7] Loading data...")
PATH = "/kaggle/input/store-sales-time-series-forecasting"

train = pd.read_csv(f"{PATH}/train.csv", parse_dates=['date'])
test = pd.read_csv(f"{PATH}/test.csv", parse_dates=['date'])
oil = pd.read_csv(f"{PATH}/oil.csv", parse_dates=['date'])
stores = pd.read_csv(f"{PATH}/stores.csv")
holidays = pd.read_csv(f"{PATH}/holidays_events.csv", parse_dates=['date'])
transactions = pd.read_csv(f"{PATH}/transactions.csv", parse_dates=['date'])

print(f"  Train shape: {train.shape}")
print(f"  Test shape: {test.shape}")
print()

# ==============================================================================
# 2. PREPROCESS OIL DATA
# ==============================================================================
print("[2/7] Processing oil data...")
oil = oil.rename(columns={'dcoilwtico': 'oil_price'})

# Create complete date range
date_range = pd.date_range(train['date'].min(), test['date'].max())
oil_full = pd.DataFrame({'date': date_range})
oil_full = oil_full.merge(oil, on='date', how='left')

# Linear interpolation; limit_direction='both' also fills gaps at both edges
oil_full['oil_price'] = oil_full['oil_price'].interpolate(method='linear', limit_direction='both')
oil_full['oil_price'] = oil_full['oil_price'].fillna(oil_full['oil_price'].mean())  # safety net, normally a no-op
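# Sanity check (sketch): with limit_direction='both' the interpolation should
# leave no gaps, so the mean fill above is expected to be a no-op.
assert oil_full['oil_price'].isna().sum() == 0, "oil_price still has gaps"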

print(f"  Oil data: {len(oil_full)} records")
print()

# ==============================================================================
# 3. PROCESS HOLIDAYS
# ==============================================================================
print("[3/7] Processing holidays...")

# Drop holidays that were transferred away from their official date; the
# actually-celebrated day appears as a separate row with type == 'Transfer'
holidays = holidays[holidays['transferred'] == False].copy()

# Create holiday categories
holidays['is_national'] = (holidays['locale'] == 'National').astype(int)
holidays['is_regional'] = (holidays['locale'] == 'Regional').astype(int)
holidays['is_local'] = (holidays['locale'] == 'Local').astype(int)

# Key national holidays
key_holidays = ['Navidad', 'Ano Nuevo', 'Dia de la Madre', 'Dia del Trabajo',
                'Primer Grito', 'Viernes Santo', 'Terremoto', 'Futbol']
for holiday in key_holidays:
    holidays[f'is_{holiday.lower().replace(" ", "_")}'] = holidays['description'].str.contains(holiday, case=False, na=False).astype(int)

# Aggregate by date
holiday_agg = holidays.groupby('date').agg({
    'is_national': 'max',
    'is_regional': 'max',
    'is_local': 'max',
    **{f'is_{h.lower().replace(" ", "_")}': 'max' for h in key_holidays}
}).reset_index()
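# Spot-check (illustrative): Navidad falls on Dec 25, so the national flag
# should be set there; an empty printout would mean the date is absent.
print(holiday_agg[holiday_agg['date'] == '2016-12-25'].T)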

print(f"  Holiday features: {len(holiday_agg.columns)-1}")
print()

# ==============================================================================
# 4. MERGE AND FEATURE ENGINEERING
# ==============================================================================
print("[4/7] Feature engineering...")

# Combine train and test
train['is_train'] = 1
test['is_train'] = 0
test['sales'] = np.nan
df = pd.concat([train, test], ignore_index=True)

# Merge stores
df = df.merge(stores, on='store_nbr', how='left')

# Merge oil
df = df.merge(oil_full, on='date', how='left')

# Merge holidays; take the flag columns from holiday_agg itself so the
# unrelated is_train marker is not caught by a startswith('is_') match
df = df.merge(holiday_agg, on='date', how='left')
holiday_cols = [c for c in holiday_agg.columns if c != 'date']
df[holiday_cols] = df[holiday_cols].fillna(0)

# Date features
df['dayofweek'] = df['date'].dt.dayofweek
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['dayofyear'] = df['date'].dt.dayofyear
df['weekofyear'] = df['date'].dt.isocalendar().week.astype(int)
df['year'] = df['date'].dt.year
df['quarter'] = df['date'].dt.quarter

# Weekend/payday features
df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)
df['is_month_start'] = (df['day'] <= 3).astype(int)
df['is_month_end'] = (df['day'] >= 28).astype(int)
df['is_payday'] = ((df['day'] == 15) | (df['day'] >= 28)).astype(int)
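# Rationale (from the competition's data description): public-sector wages in
# Ecuador are paid on the 15th and on the last day of each month, hence the
# payday flags above.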

# Cyclic encoding for seasonality
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
df['dayofweek_sin'] = np.sin(2 * np.pi * df['dayofweek'] / 7)
df['dayofweek_cos'] = np.cos(2 * np.pi * df['dayofweek'] / 7)
df['dayofyear_sin'] = np.sin(2 * np.pi * df['dayofyear'] / 365)
df['dayofyear_cos'] = np.cos(2 * np.pi * df['dayofyear'] / 365)
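# Why cyclic encoding: raw month jumps from 12 back to 1, while the (sin, cos)
# pair keeps December and January adjacent on the unit circle. A quick check:
# every encoded point must satisfy sin^2 + cos^2 == 1.
assert np.allclose(df['month_sin']**2 + df['month_cos']**2, 1.0)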

# Encode categoricals
df['family_encoded'] = df['family'].astype('category').cat.codes
df['store_type_encoded'] = df['type'].astype('category').cat.codes
df['city_encoded'] = df['city'].astype('category').cat.codes
df['state_encoded'] = df['state'].astype('category').cat.codes

# Merge transactions. transactions.csv covers only the training period, so all
# test rows are NaN; fill them with each store's historical mean rather than 0,
# which would wrongly tell the model that test days had no footfall
df = df.merge(transactions, on=['date', 'store_nbr'], how='left')
store_txn_mean = df.groupby('store_nbr')['transactions'].transform('mean')
df['transactions'] = df['transactions'].fillna(store_txn_mean).fillna(0)

print(f"  Total features before lags: {len(df.columns)}")
print()

# ==============================================================================
# 5. CREATE LAG FEATURES (MULTI-LAG APPROACH)
# ==============================================================================
print("[5/7] Creating multi-lag features...")

# Sort by store, family, date
df = df.sort_values(['store_nbr', 'family', 'date']).reset_index(drop=True)

# Create group key
df['group_key'] = df['store_nbr'].astype(str) + '_' + df['family'].astype(str)

# Multi-lag approach adapted from top notebooks (which pair short lags with
# recursive prediction). This script predicts the whole 16-day horizon in one
# shot, so every lag/window must look back at least 16 days; shorter shifts
# would read NaN sales from inside the test period and get filled with 0.
HORIZON = 16  # days from the last train date (Aug 15) to the last test date (Aug 31)
lag_days = [16, 21, 28, 63]  # 365 skipped for memory
window_sizes = [7, 14, 21, 28]

# Function to create lag features for one store/family series
def create_lag_features(group):
    for lag in lag_days:
        group[f'sales_lag_{lag}'] = group['sales'].shift(lag)

    # Rolling stats over windows that end HORIZON days back, so they are fully
    # observable for every test date
    shifted = group['sales'].shift(HORIZON)
    for window in window_sizes:
        group[f'sales_rolling_mean_{window}'] = shifted.rolling(window=window, min_periods=1).mean()
        group[f'sales_rolling_std_{window}'] = shifted.rolling(window=window, min_periods=1).std()
        group[f'sales_ewm_{window}'] = shifted.ewm(span=window, min_periods=1).mean()

    # Zero detection (21-day window from top notebooks), vectorized instead of
    # a rolling apply for speed
    group['zero_count_21'] = shifted.eq(0).rolling(window=21, min_periods=1).sum()
    group['all_zero_21'] = (group['zero_count_21'] >= 21).astype(int)

    return group

# Apply lag features
df = df.groupby('group_key', group_keys=False).apply(create_lag_features)

# Fill NaN lags with 0
lag_cols = [c for c in df.columns if 'lag_' in c or 'rolling_' in c or 'ewm_' in c]
df[lag_cols] = df[lag_cols].fillna(0)

print(f"  Lag columns created: {len(lag_cols)}")
print()
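# Alignment spot-check (a sketch; store 1 / AUTOMOTIVE is an arbitrary series):
# on the first test day, the 16-day lag should equal the sales 16 days earlier.
_g = df[df['group_key'] == '1_AUTOMOTIVE']
_lag = _g.loc[_g['date'] == '2017-08-16', 'sales_lag_16']
_ref = _g.loc[_g['date'] == '2017-07-31', 'sales']
if len(_lag) and len(_ref):
    assert _lag.iloc[0] == _ref.iloc[0], "sales_lag_16 is misaligned"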

# ==============================================================================
# 6. PREPARE TRAINING DATA
# ==============================================================================
print("[6/7] Preparing training data...")

# Exclude features
exclude_cols = ['id', 'date', 'sales', 'is_train', 'family', 'city', 'state',
                'type', 'group_key']

feature_cols = [c for c in df.columns if c not in exclude_cols]

# Training data (exclude first 63 days for lag features)
train_mask = (df['is_train'] == 1) & (df['date'] >= df['date'].min() + pd.Timedelta(days=63))
train_df = df[train_mask].copy()
test_df = df[df['is_train'] == 0].copy()

# Drop rows whose observable 21-day window was entirely zero; those series are
# zero-forecast directly at prediction time instead of being modeled
train_df_filtered = train_df[train_df['all_zero_21'] == 0]

# TimeSeriesSplit assumes chronological row order, but the frame is currently
# sorted by store/family/date, so re-sort by date before splitting
train_df_filtered = train_df_filtered.sort_values('date').reset_index(drop=True)

X_train = train_df_filtered[feature_cols]
y_train = np.log1p(train_df_filtered['sales'].clip(lower=0))  # Series.clip takes lower=, not min=

X_test = test_df[feature_cols]

print(f"  Training samples: {len(X_train)}")
print(f"  Features: {len(feature_cols)}")
print(f"  Test samples: {len(X_test)}")
print()

# ==============================================================================
# 7. TRAIN LIGHTGBM MODEL
# ==============================================================================
print("[7/7] Training LightGBM ensemble...")

# LightGBM parameters (Tweedie objective on the log1p target; values follow
# common public-notebook settings rather than a systematic search)
lgb_params = {
    'objective': 'tweedie',
    'tweedie_variance_power': 1.5,
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.03,
    'num_leaves': 127,
    'max_depth': 10,
    'min_child_samples': 50,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'lambda_l1': 0.1,
    'lambda_l2': 1.0,
    'n_estimators': 2000,
    'verbose': -1,
    'random_state': 42,
    'n_jobs': -1
}
# Early stopping is handled by the lgb.early_stopping callback passed to fit()
# below, so it is not duplicated here in the constructor params.
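# Note: the target below is log1p-transformed while the objective is Tweedie,
# which is usually applied to raw non-negative sales. Both setups appear in
# public notebooks; objective='regression' on the log scale (or Tweedie on raw
# sales without the log1p/expm1 round trip) is a reasonable A/B test.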

# Cross-validation with TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=3)
cv_scores = []
models = []

for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train), 1):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model = lgb.LGBMRegressor(**lgb_params)
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(100, verbose=False), lgb.log_evaluation(200)]
    )

    # Predict and calculate RMSLE
    val_pred = model.predict(X_val)
    val_pred = np.expm1(val_pred).clip(min=0)
    y_val_actual = np.expm1(y_val)

    rmsle = np.sqrt(np.mean((np.log1p(val_pred) - np.log1p(y_val_actual))**2))
    cv_scores.append(rmsle)
    models.append(model)

    print(f"  Fold {fold} RMSLE: {rmsle:.5f}")

print()
print(f"  Mean CV RMSLE: {np.mean(cv_scores):.5f} (+/- {np.std(cv_scores):.5f})")
print()

# ==============================================================================
# 8. MAKE PREDICTIONS
# ==============================================================================
print("[8/8] Making predictions...")

# Ensemble predictions (average all folds)
preds = np.zeros(len(X_test))
for model in models:
    preds += model.predict(X_test)
preds /= len(models)

# Convert back from log scale
preds = np.expm1(preds).clip(min=0)

# Apply zero forecasting
zero_mask = test_df['all_zero_21'] == 1
preds[zero_mask.values] = 0
print(f"  Zero-forecasted samples: {zero_mask.sum()}")

# Create submission
submission = pd.DataFrame({
    'id': test_df['id'].astype(int),
    'sales': preds
})

submission.to_csv('submission.csv', index=False)
print(f"  Submission saved: {len(submission)} predictions")
print(f"  Sales range: {preds.min():.2f} - {preds.max():.2f}")
print()

# Feature importance
print("[Feature Importance - Top 15]")
importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': np.mean([m.feature_importances_ for m in models], axis=0)
}).sort_values('importance', ascending=False)
print(importance.head(15).to_string(index=False))
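# Optional: persist the averaged importances alongside the submission
# (hypothetical filename; adjust as needed).
importance.to_csv('feature_importance.csv', index=False)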
print()

print("="*70)
print("   DONE! submission.csv ready for upload")
print("="*70)
