TestForge | 📊 Plogger ✍️ Blog 📚 Docs
TestForge Blog

Python AI 가이드 · 04

AI · ML 라이브러리

scikit-learn으로 클래식 ML을, PyTorch로 딥러닝 기초를 다룹니다. 모델 훈련에서 평가, 저장까지 전체 흐름을 정리합니다.

1. scikit-learn — 머신러닝 기초

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Hold out 20% for evaluation; stratify keeps the class ratio identical
# in both splits, and random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale first, then classify. Bundling both steps in a Pipeline means
# cross-validation refits the scaler per fold — no test-set leakage.
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(n_estimators=200, max_depth=8, random_state=42)),
])
pipeline.fit(X_train, y_train)

cv = cross_val_score(pipeline, X_train, y_train, cv=5, scoring="f1")
print(f"CV F1: {cv.mean():.3f} ± {cv.std():.3f}")

y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred, target_names=["하락", "상승"]))

# Feature importances, highest first
clf = pipeline.named_steps["clf"]
feature_names = ["change_rate", "volume_ratio", "buy_pressure", "vwap_dev"]
ranked = sorted(zip(feature_names, clf.feature_importances_),
                key=lambda pair: pair[1], reverse=True)
for name, importance in ranked:
    print(f"{name:<20} {importance:.4f}")

2. 모델 저장과 불러오기

import joblib
import numpy as np
from pathlib import Path

# Persist the whole fitted pipeline (scaler + classifier) so that inference
# always applies exactly the same preprocessing as training.
path = Path("models/signal_classifier.joblib")
# parents=True: also create intermediate directories — the original
# mkdir(exist_ok=True) would raise FileNotFoundError for a nested path.
path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(pipeline, path)

loaded = joblib.load(path)
# One sample with the four features in training order:
# change_rate, volume_ratio, buy_pressure, vwap_dev.
new = np.array([[1.5, 2.3, 0.72, 0.8]])
prob = loaded.predict_proba(new)[0]
label = loaded.predict(new)[0]
print(f"예측: {'상승' if label == 1 else '하락'} (확률: {prob[label]:.2%})")

3. PyTorch — 딥러닝 기초

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Wrap the training arrays as tensors: CrossEntropyLoss expects float32
# inputs and int64 class labels. Shuffle each epoch for SGD-style training.
dataset = TensorDataset(torch.FloatTensor(X_train), torch.LongTensor(y_train))
loader = DataLoader(dataset, batch_size=64, shuffle=True)

class SignalNet(nn.Module):
    """Small MLP that maps an n-feature signal vector to 2-class logits.

    Outputs are raw logits — pair with ``nn.CrossEntropyLoss``, which
    applies log-softmax internally.
    """

    def __init__(self, n: int):
        super().__init__()
        layers = [
            nn.Linear(n, 64),
            nn.ReLU(),
            nn.Dropout(0.3),  # regularize the widest layer
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 2),
        ]
        # Kept as a single Sequential named "net" so state_dict keys match
        # checkpoints saved from the original definition.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)

# Use the GPU when one is available; model and batches must share a device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SignalNet(X_train.shape[1]).to(device)
loss_fn = nn.CrossEntropyLoss()
opt = optim.AdamW(model.parameters(), lr=1e-3)

# Plain training loop. The original folded zero_grad()/backward() into a
# single sum(...) expression with `or` chaining: it ran the forward pass
# twice per batch and never called opt.step(), so the weights were never
# actually updated.
for epoch in range(50):
    model.train()  # enable dropout during training
    total = 0.0
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)
        opt.zero_grad()
        loss = loss_fn(model(xb), yb)  # one forward pass per batch
        loss.backward()
        opt.step()  # apply the gradient update (missing in the original)
        total += loss.item()
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1:3d} done")

# Evaluation and saving

from pathlib import Path

from sklearn.metrics import accuracy_score, f1_score

# Inference mode: disable dropout and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
    logits = model(torch.FloatTensor(X_test).to(device))
    preds = logits.argmax(1).cpu().numpy()

print(f"Accuracy: {accuracy_score(y_test, preds):.4f}")
print(f"F1:       {f1_score(y_test, preds):.4f}")

# Create the output directory so this section also works standalone —
# the original torch.save fails with FileNotFoundError if models/ is absent.
Path("models").mkdir(parents=True, exist_ok=True)
# Save weights only (state_dict); restore by rebuilding SignalNet and
# calling load_state_dict.
torch.save(model.state_dict(), "models/signal_net.pt")

4. 평가 지표 선택 기준

| 지표 | 적합한 상황 | 주의 |
| --- | --- | --- |
| Accuracy | 클래스 균형이 잡혀 있을 때 | 불균형 데이터에서 오해 가능 |
| Precision | False Positive 비용이 높을 때 | 재현율과 트레이드오프 |
| Recall | False Negative 비용이 높을 때 | 정밀도와 트레이드오프 |
| F1 Score | 정밀도·재현율 균형이 필요할 때 | 불균형 시 가중 F1 사용 |
| ROC-AUC | 임계값 독립적 성능 비교 | 불균형 시 PR-AUC가 더 적합 |