소스 코드 설명
파일 구조
src/
├── predictor.py # 예측 클래스
├── feature_engineering.py # 특징 생성
├── train.py # 학습 스크립트
└── README.md # 이 파일
각 파일 설명
1. predictor.py - 예측 클래스
용도: 학습된 모델을 로드하고 예측을 수행하는 메인 클래스
주요 클래스: EarlyWarningPredictor
주요 메서드:
# 모델 로드 (허깅페이스 스타일)
model = EarlyWarningPredictor.from_pretrained("models/")
# 단일 예측
result = model.predict(store_data)
# 배치 예측
results = model.predict_batch(stores_df)
# 예측 설명
explanation = model.explain(store_data)
# 모델 정보
info = model.get_model_info()
반환 값:
{
'risk_score': 78.5, # 0-100점 위험도
'risk_level': '높음', # 낮음/보통/높음
'closure_probability': 0.785, # 폐업 확률
'risk_factors': {...}, # 위험 요인별 점수
'action_items': [...] # 권장 조치
}
수정 방법:
# 1. 위험도 임계값 변경
def predict(self, store_data, threshold=0.5): # 기본값 변경
...
# 2. 앙상블 가중치 조정
# models/config.json 파일에서:
{
"ensemble_weights": [0.6, 0.4] # XGBoost 60%, LightGBM 40%
}
# 3. 위험 등급 기준 변경
if risk_score < 40: # 기존 30에서 40으로
risk_level = '낮음'
2. feature_engineering.py - 특징 생성
용도: 원본 데이터에서 47개의 특징을 자동으로 생성
주요 클래스: FeatureEngineer
생성되는 특징:
매출 관련 (15개)
- sales_avg_1m, sales_avg_3m, sales_avg_6m, sales_avg_12m
- sales_recent_vs_previous, sales_mom_change, sales_yoy_change
- sales_max, sales_min, sales_range
고객 관련 (12개)
- customer_reuse_rate, customer_reuse_trend
- customer_new_rate
- 연령/성별별 고객 비율 (10개)
운영 관련 (8개)
- operation_months, operation_avg_amount
- operation_cancel_rate, operation_delivery_rate
트렌드 (5개)
- trend_slope, trend_r2, trend_direction
- trend_consecutive_down, trend_consecutive_up
변동성 (4개)
- volatility_cv, volatility_std, volatility_mad, volatility_recent_std
계절성 (2개)
- seasonality_detected, seasonality_strength
맥락 (1개)
- context_industry
사용 예시:
from feature_engineering import FeatureEngineer
engineer = FeatureEngineer()
features = engineer.create_features(
store_data={'industry': '카페', 'location': '서울'},
monthly_usage=usage_df,
monthly_customers=customer_df
)
새로운 특징 추가 방법:
class FeatureEngineer:
def _create_custom_features(self, df):
"""커스텀 특징 추가"""
features = {}
# 예: 성장률 지표
if 'RC_M1_SAA' in df.columns and len(df) >= 6:
recent_3m = df['RC_M1_SAA'].tail(3).mean()
past_3m = df['RC_M1_SAA'].head(3).mean()
features['growth_rate'] = (recent_3m / past_3m - 1) * 100
return features
def create_features(self, store_data, monthly_usage, monthly_customers):
features = {}
# 기존 특징들...
features.update(self._create_sales_features(monthly_usage))
features.update(self._create_customer_features(monthly_customers))
# 새로운 커스텀 특징 추가
features.update(self._create_custom_features(monthly_usage))
return pd.DataFrame([features])
3. train.py - 학습 스크립트
용도: 커맨드라인에서 모델을 학습하는 스크립트
사용법:
# 기본 사용
python src/train.py
# 옵션 지정
python src/train.py --data data/raw --output models/ --max-stores 1000
# 도움말
python src/train.py --help
파라미터:
- --data: 데이터 디렉토리 경로 (기본: data/raw)
- --output: 모델 저장 경로 (기본: models)
- --max-stores: 테스트용 최대 매장 수 (선택사항)
주요 함수:
def load_data(data_dir)
"""데이터 로드"""
def create_features(df_store, df_usage, df_customer)
"""특징 생성"""
def preprocess_data(X, y)
"""전처리 및 분할"""
def apply_smote(X_train, y_train)
"""SMOTE 적용"""
def train_models(X_train, y_train)
"""모델 학습"""
def evaluate_models(xgb_model, lgb_model, X_test, y_test)
"""평가"""
def save_models(...)
"""모델 저장"""
수정 방법:
# 1. 모델 하이퍼파라미터 변경
def train_models(X_train, y_train):
xgb_model = xgb.XGBClassifier(
max_depth=8, # 6에서 8로 증가
learning_rate=0.05, # 0.1에서 0.05로 감소
n_estimators=300, # 200에서 300으로 증가
# ...
)
# 2. 앙상블 가중치 변경
def evaluate_models(...):
ensemble_pred = 0.6 * xgb_pred + 0.4 * lgb_pred # 기존 0.5, 0.5
# 3. 데이터 분할 비율 변경
def preprocess_data(X, y):
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, ... # 0.25에서 0.2로
)
주요 수정 시나리오
시나리오 1: 새로운 데이터로 학습
1단계: 데이터 준비
# data/raw/에 CSV 파일 3개 배치
data/raw/
├── big_data_set1_f.csv
├── ds2_monthly_usage.csv
└── ds3_monthly_customers.csv
2단계: 학습 실행
python src/train.py
3단계: 예측 사용
from src.predictor import EarlyWarningPredictor
model = EarlyWarningPredictor.from_pretrained("models/")
์๋๋ฆฌ์ค 2: ๋ชจ๋ธ ์ฑ๋ฅ ๊ฐ์
๋ฐฉ๋ฒ 1: ํน์ง ์ถ๊ฐ
# feature_engineering.py์ ์๋ก์ด ํน์ง ์ถ๊ฐ
def _create_custom_features(self, df):
# ์๋ก์ด ์งํ ๊ณ์ฐ
pass
๋ฐฉ๋ฒ 2: ํ์ดํผํ๋ผ๋ฏธํฐ ํ๋
# train.py์์ ํ๋ผ๋ฏธํฐ ์กฐ์
xgb_model = xgb.XGBClassifier(
max_depth=8,
learning_rate=0.05,
...
)
๋ฐฉ๋ฒ 3: ์์๋ธ ๊ฐ์ค์น ์กฐ์
# models/config.json ์์
{
"ensemble_weights": [0.6, 0.4]
}
시나리오 3: 예측 임계값 조정
더 민감하게 (조기 경보 강화):
result = model.predict(store_data, threshold=0.3)
# 폐업 확률 30% 이상이면 위험으로 판단
더 보수적으로:
result = model.predict(store_data, threshold=0.7)
# 폐업 확률 70% 이상이어야 위험으로 판단
참고 자료
- XGBoost 문서: https://xgboost.readthedocs.io/
- LightGBM 문서: https://lightgbm.readthedocs.io/
- SMOTE 설명: https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTE.html