"""
Store Sales V4 - Optimized LightGBM
Based on V1 (CV 0.347) with winning techniques
"""

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
import warnings
warnings.filterwarnings('ignore')

print("=" * 70)
print("   Store Sales V4 - Optimized")
print("=" * 70)

# Raw competition tables; all date columns parsed up front.
PATH = "/kaggle/input/store-sales-time-series-forecasting"
train = pd.read_csv(f"{PATH}/train.csv", parse_dates=['date'])
test = pd.read_csv(f"{PATH}/test.csv", parse_dates=['date'])
oil = pd.read_csv(f"{PATH}/oil.csv", parse_dates=['date'])
stores = pd.read_csv(f"{PATH}/stores.csv")
holidays = pd.read_csv(f"{PATH}/holidays_events.csv", parse_dates=['date'])

print(f"Train: {train.shape}, Test: {test.shape}")

# Oil price: align onto the full daily calendar spanning train+test,
# linearly interpolate gaps in both directions, then fall back to the
# series mean for anything still missing.
oil = oil.rename(columns={'dcoilwtico': 'oil_price'})
all_days = pd.DataFrame(
    {'date': pd.date_range(train['date'].min(), test['date'].max())}
)
oil_full = all_days.merge(oil, on='date', how='left')
oil_full['oil_price'] = oil_full['oil_price'].interpolate(
    method='linear', limit_direction='both'
)
oil_full['oil_price'] = oil_full['oil_price'].fillna(oil_full['oil_price'].mean())

# One row per national-holiday date, flagged with is_holiday=1.
national_dates = holidays.loc[holidays['locale'] == 'National', ['date']].drop_duplicates()
national_dates['is_holiday'] = 1

# Stack train and test into a single frame (test sales unknown -> NaN)
# so every feature below is built identically for both partitions.
train['is_train'] = 1
test['is_train'] = 0
test['sales'] = np.nan
df = (
    pd.concat([train, test], ignore_index=True)
    .merge(stores, on='store_nbr', how='left')
    .merge(oil_full, on='date', how='left')
    .merge(national_dates, on='date', how='left')
)
df['is_holiday'] = df['is_holiday'].fillna(0)

# Calendar features derived from the date column.
dt_col = df['date'].dt
df['dayofweek'] = dt_col.dayofweek
df['month'] = dt_col.month
df['day'] = dt_col.day
df['dayofyear'] = dt_col.dayofyear
df['weekofyear'] = dt_col.isocalendar().week.astype(int)
df['year'] = dt_col.year
df['is_weekend'] = df['dayofweek'].ge(5).astype(int)
df['is_month_end'] = df['day'].ge(28).astype(int)
# Payday in Ecuador: the 15th and month end.
df['is_payday'] = (df['day'].eq(15) | df['day'].ge(28)).astype(int)

# Categorical columns -> integer codes for LightGBM.
for src_col, enc_col in [('family', 'family_encoded'),
                         ('type', 'store_type_encoded'),
                         ('city', 'city_encoded')]:
    df[enc_col] = df[src_col].astype('category').cat.codes
df['cluster'] = df['cluster'].fillna(0)

# Sort by series then date so groupwise shift/rolling see chronological order.
df = df.sort_values(['store_nbr', 'family', 'date']).reset_index(drop=True)

# Lag features per (store, family) series.
print("Creating lag features...")
for lag in [7, 14, 28]:
    df[f'sales_lag_{lag}'] = df.groupby(['store_nbr', 'family'])['sales'].shift(lag)

# Rolling means of past sales (shift(1) excludes the current day).
for window in [7, 14, 28]:
    df[f'sales_rolling_{window}'] = df.groupby(['store_nbr', 'family'])['sales'].transform(
        lambda x: x.shift(1).rolling(window, min_periods=1).mean()
    )

# Zero detection (21-day window): flag rows whose previous 21 observed sales
# were all zero. BUG FIX: for test rows 'sales' is NaN inside the window, so
# (y == 0).all() evaluated False for every test date after the first and the
# zero-forecast step downstream almost never fired on the test set. Keep the
# rolling flag for train rows, then propagate each series' final train-history
# flag to all of its test rows.
df['zero_21'] = df.groupby(['store_nbr', 'family'])['sales'].transform(
    lambda x: x.shift(1).rolling(21, min_periods=21).apply(lambda y: (y == 0).all())
)
last_train_zero = (
    df[df['is_train'] == 1]
    .groupby(['store_nbr', 'family'])['sales']
    .apply(lambda s: float(len(s) >= 21 and (s.tail(21) == 0).all()))
)
test_rows = df['is_train'] == 0
test_keys = pd.MultiIndex.from_frame(df.loc[test_rows, ['store_nbr', 'family']])
df.loc[test_rows, 'zero_21'] = last_train_zero.reindex(test_keys).values
df['zero_21'] = df['zero_21'].fillna(0)

# Lags/rolling means are NaN at each series' start; fill with 0.
lag_cols = [c for c in df.columns if 'lag_' in c or 'rolling_' in c]
df[lag_cols] = df[lag_cols].fillna(0)

# Feature columns: everything except identifiers, target, and the raw
# (un-encoded) categorical columns.
exclude = ['id', 'date', 'sales', 'is_train', 'family', 'city', 'state', 'type']
features = [c for c in df.columns if c not in exclude]

# Prepare train/test; drop the earliest weeks so the longest (28-day)
# lags are populated.
train_mask = (df['is_train'] == 1) & (df['date'] >= '2013-03-01')
train_df = df[train_mask].copy()
test_df = df[df['is_train'] == 0].copy()

# Remove all-zero series from training.
train_df = train_df[train_df['zero_21'] != 1]

# BUG FIX: TimeSeriesSplit splits by row position and assumes chronological
# order, but df was sorted by (store_nbr, family, date) — folds mixed past and
# future dates (leakage; CV scores were not time-ordered). Stable-sort by date
# so each CV fold validates strictly on later dates than it trains on.
train_df = train_df.sort_values('date', kind='mergesort')

X_train = train_df[features]
y_train = np.log1p(np.maximum(train_df['sales'].values, 0))
X_test = test_df[features]

print(f"Training: {len(X_train)}, Features: {len(features)}")

# LightGBM hyperparameters, carried over unchanged from the V1
# configuration (the run that scored CV 0.347).
params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    learning_rate=0.05,
    num_leaves=255,
    max_depth=12,
    min_child_samples=30,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l1=0.1,
    lambda_l2=0.1,
    n_estimators=2000,
    verbose=-1,
    random_state=42,
    n_jobs=-1,
)

# Expanding-window CV: each fold trains on earlier rows and validates on
# the next chunk; early stopping watches the fold's validation RMSE.
print("\nTraining...")
tscv = TimeSeriesSplit(n_splits=3)
models = []
cv_scores = []

for fold, (fit_rows, hold_rows) in enumerate(tscv.split(X_train), 1):
    X_fit, y_fit = X_train.iloc[fit_rows], y_train[fit_rows]
    X_hold, y_hold = X_train.iloc[hold_rows], y_train[hold_rows]

    model = lgb.LGBMRegressor(**params)
    model.fit(
        X_fit, y_fit,
        eval_set=[(X_hold, y_hold)],
        callbacks=[lgb.early_stopping(50, verbose=False), lgb.log_evaluation(100)],
    )

    # Score in the original sales space; targets were log1p-transformed.
    val_pred = np.maximum(np.expm1(model.predict(X_hold)), 0)
    val_actual = np.expm1(y_hold)
    rmsle = np.sqrt(np.mean((np.log1p(val_pred) - np.log1p(val_actual)) ** 2))

    cv_scores.append(rmsle)
    models.append(model)
    print(f"  Fold {fold}: {rmsle:.5f}")

print(f"\nMean CV: {np.mean(cv_scores):.5f} (+/- {np.std(cv_scores):.5f})")

# Average the fold models in log space, then invert and clip at zero.
print("\nPredicting...")
log_preds = sum(m.predict(X_test) for m in models) / len(models)
preds = np.maximum(np.expm1(log_preds), 0)

# Hard-zero forecasts for series flagged as dormant (all-zero recent history).
zero_mask = test_df['zero_21'].values == 1
preds[zero_mask] = 0
print(f"Zero-forecasted: {zero_mask.sum()}")

# Write the submission file.
submission = pd.DataFrame({'id': test_df['id'].astype(int), 'sales': preds})
submission.to_csv('submission.csv', index=False)
print(f"\nSubmission: {len(submission)} rows")
print(f"Sales range: {preds.min():.2f} - {preds.max():.2f}")

# Report the features with the highest importance averaged over fold models.
print("\n[Top 10 Features]")
mean_importance = np.mean([m.feature_importances_ for m in models], axis=0)
imp = pd.DataFrame({'feature': features, 'importance': mean_importance})
print(imp.nlargest(10, 'importance').to_string(index=False))

print("\n" + "=" * 70)
print("   DONE!")
print("=" * 70)
