Python AI 가이드 · 04
AI · ML 라이브러리
scikit-learn으로 클래식 ML을, PyTorch로 딥러닝 기초를 다룹니다. 모델 훈련에서 평가, 저장까지 전체 흐름을 정리합니다.
1. scikit-learn — 머신러닝 기초
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
pipeline = Pipeline([
("scaler", StandardScaler()),
("clf", RandomForestClassifier(n_estimators=200, max_depth=8, random_state=42)),
])
pipeline.fit(X_train, y_train)
cv = cross_val_score(pipeline, X_train, y_train, cv=5, scoring="f1")
print(f"CV F1: {cv.mean():.3f} ± {cv.std():.3f}")
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred, target_names=["하락", "상승"])) # 피처 중요도
clf = pipeline.named_steps["clf"]
feature_names = ["change_rate", "volume_ratio", "buy_pressure", "vwap_dev"]
for i in clf.feature_importances_.argsort()[::-1]:
print(f"{feature_names[i]:<20} {clf.feature_importances_[i]:.4f}") 2. 모델 저장과 불러오기
import joblib, numpy as np
from pathlib import Path
path = Path("models/signal_classifier.joblib")
path.parent.mkdir(exist_ok=True)
joblib.dump(pipeline, path)
loaded = joblib.load(path)
new = np.array([[1.5, 2.3, 0.72, 0.8]])
prob = loaded.predict_proba(new)[0]
label = loaded.predict(new)[0]
print(f"예측: {'상승' if label == 1 else '하락'} (확률: {prob[label]:.2%})") 3. PyTorch — 딥러닝 기초
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Wrap the numpy training arrays as tensors (float features, long labels
# for CrossEntropyLoss) and serve shuffled mini-batches of 64.
dataset = TensorDataset(torch.FloatTensor(X_train), torch.LongTensor(y_train))
loader = DataLoader(dataset, batch_size=64, shuffle=True)
class SignalNet(nn.Module):
    """Small MLP mapping n input features to 2 class logits."""

    def __init__(self, n: int):
        super().__init__()
        layers = [
            nn.Linear(n, 64),
            nn.ReLU(),
            nn.Dropout(0.3),   # regularization after the widest layer
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 2),  # raw logits; CrossEntropyLoss applies softmax
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SignalNet(X_train.shape[1]).to(device)
opt = optim.AdamW(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()
for epoch in range(50):
model.train()
total = sum(
(opt.zero_grad() or loss_fn(model(xb.to(device)), yb.to(device)))
.backward() or loss_fn(model(xb.to(device)), yb.to(device)).item()
for xb, yb in loader
)
if (epoch + 1) % 10 == 0:
print(f"Epoch {epoch+1:3d} done") # 평가 및 저장
model.eval()
with torch.no_grad():
preds = model(torch.FloatTensor(X_test).to(device)).argmax(1).cpu().numpy()
from sklearn.metrics import accuracy_score, f1_score
print(f"Accuracy: {accuracy_score(y_test, preds):.4f}")
print(f"F1: {f1_score(y_test, preds):.4f}")
torch.save(model.state_dict(), "models/signal_net.pt") 4. 평가 지표 선택 기준
| 지표 | 적합한 상황 | 주의 |
|---|---|---|
| Accuracy | 클래스 균형이 잡혀 있을 때 | 불균형 데이터에서 오해 가능 |
| Precision | False Positive 비용이 높을 때 | 재현율과 트레이드오프 |
| Recall | False Negative 비용이 높을 때 | 정밀도와 트레이드오프 |
| F1 Score | 정밀도·재현율 균형이 필요할 때 | 불균형 시 가중 F1 사용 |
| ROC-AUC | 임계값 독립적 성능 비교 | 불균형 시 PR-AUC가 더 적합 |