이상 탐지 프로젝트 - 배움 에이아이

극심한 클래스 불균형(정상 98% vs 사기 2%) 상황에서 이상 거래를 탐지하는 프로젝트입니다. 비지도 이상 탐지 모델과 지도학습 모델을 모두 적용하고, 불균형 처리 기법의 효과를 비교합니다.

프로젝트 개요

항목	내용
문제 유형	이진 분류 (사기 탐지) + 이상 탐지
데이터셋	합성 신용카드 거래 데이터
핵심 기법	불균형 처리, 비지도 이상 탐지, 평가 지표 선택
사용 알고리즘	Isolation Forest, LOF, SMOTE, XGBoost
난이도	중급

프로젝트 실습

불균형 데이터 생성 및 분석

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification

# Build a heavily imbalanced synthetic dataset (nominally 98% normal vs 2% fraud).
# NOTE: flip_y=0.01 assigns a random class to about 1% of the samples, so the
# observed fraud share can differ slightly from the nominal 2%.
X, y = make_classification(
    n_samples=10000,
    n_features=20,
    n_informative=10,
    n_redundant=5,
    n_classes=2,
    weights=[0.98, 0.02],  # class proportions: [normal, fraud]
    flip_y=0.01,
    random_state=42,
)

df = pd.DataFrame(X, columns=[f"V{i+1}" for i in range(X.shape[1])])
df["is_fraud"] = y

# Count the classes once and order them by label (0, 1) instead of by
# frequency, so the bars always line up with the hard-coded tick labels below.
class_counts = df["is_fraud"].value_counts().sort_index()

print(f"데이터 크기: {df.shape}")
print("\n클래스 분포:")
print(class_counts)
print(f"\n불균형 비율: {df['is_fraud'].mean():.2%}")

# Left panel: class distribution. Right panel: V1 histogram per class.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
class_counts.plot(kind="bar", ax=axes[0])
axes[0].set_title("클래스 분포")
axes[0].set_xticklabels(["정상", "사기"], rotation=0)

# Overlay the distribution of one feature for each class.
for label, color in [(0, "blue"), (1, "red")]:
    subset = df[df["is_fraud"] == label]
    axes[1].hist(subset["V1"], bins=50, alpha=0.5,
                 color=color, label="정상" if label == 0 else "사기")
axes[1].set_title("V1 변수 분포 비교")
axes[1].legend()
plt.tight_layout()
plt.show()

비지도 이상 탐지

from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Train/test split, stratified on the label so both splits keep the fraud ratio.
X = df.drop(columns=["is_fraud"])
y = df["is_fraud"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scaling: fit the scaler on the training split only, then apply it to both.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Both detectors are fit on the training split (labels are not used) and
# evaluated on the held-out test split. Fitting them directly on the test
# split would leave the training data unused and would not measure how the
# detectors behave on unseen transactions.

# 1. Isolation Forest
iso_forest = IsolationForest(
    n_estimators=200,
    contamination=0.02,    # expected share of anomalies
    random_state=42,
    n_jobs=-1,
)
iso_forest.fit(X_train_scaled)
iso_pred = iso_forest.predict(X_test_scaled)
# predict() returns -1 for anomalies and 1 for inliers; map to 1 = fraud, 0 = normal.
iso_labels = (iso_pred == -1).astype(int)
# score_samples() is lower for more abnormal points; negate so higher = more anomalous.
iso_scores = -iso_forest.score_samples(X_test_scaled)

print("=== Isolation Forest ===")
print(classification_report(y_test, iso_labels, target_names=["정상", "사기"]))
print(f"ROC-AUC: {roc_auc_score(y_test, iso_scores):.4f}")

# 2. Local Outlier Factor
# novelty=True is required to call predict()/score_samples() on data that was
# not part of fit(); with novelty=False only fit_predict() on the fitted data
# is available.
lof = LocalOutlierFactor(
    n_neighbors=20,
    contamination=0.02,
    novelty=True,
)
lof.fit(X_train_scaled)
lof_pred = lof.predict(X_test_scaled)
lof_labels = (lof_pred == -1).astype(int)
lof_scores = -lof.score_samples(X_test_scaled)

print("=== Local Outlier Factor ===")
print(classification_report(y_test, lof_labels, target_names=["정상", "사기"]))
print(f"ROC-AUC: {roc_auc_score(y_test, lof_scores):.4f}")

불균형 처리 기법 비교

from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# cross_validate evaluates several metrics from a single set of CV fits.
from sklearn.model_selection import cross_validate

# Resampling strategies to compare (None = train on the data as-is).
strategies = {
    "처리 없음": None,
    "SMOTE": SMOTE(random_state=42),
    "ADASYN": ADASYN(random_state=42),
    "언더샘플링": RandomUnderSampler(random_state=42),
    "SMOTE + Tomek": SMOTETomek(random_state=42),
}

results = {}
for name, sampler in strategies.items():
    # Build scaler -> (optional sampler) -> model; the sampler step is simply
    # omitted for the baseline.
    steps = [("scaler", StandardScaler())]
    if sampler is not None:
        steps.append(("sampler", sampler))
    steps.append(
        ("model", RandomForestClassifier(n_estimators=100, random_state=42))
    )
    pipe = ImbPipeline(steps)

    # Score with F1 and average precision (accuracy is misleading on
    # imbalanced data). One cross_validate call fits each fold once and
    # computes both metrics, instead of refitting the same pipeline on the
    # same folds once per metric.
    cv_scores = cross_validate(
        pipe, X_train, y_train, cv=5, scoring=["f1", "average_precision"]
    )
    f1_scores = cv_scores["test_f1"]
    pr_auc_scores = cv_scores["test_average_precision"]

    results[name] = {
        "F1": f"{f1_scores.mean():.4f} (+/- {f1_scores.std():.4f})",
        "PR-AUC": f"{pr_auc_scores.mean():.4f} (+/- {pr_auc_scores.std():.4f})",
    }

print(pd.DataFrame(results).T)

최적 모델 구축

from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# XGBoost + SMOTE pipeline: scale -> oversample with SMOTE -> XGBoost classifier.
xgb_model = XGBClassifier(
    scale_pos_weight=1,  # left at 1 because SMOTE is used to balance the classes
    eval_metric="logloss",
    random_state=42,
)
pipe = ImbPipeline(steps=[
    ("scaler", StandardScaler()),
    ("sampler", SMOTE(random_state=42)),
    ("model", xgb_model),
])

# Search space for the randomized search.
# scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from low up to high - 1.
param_dist = dict(
    model__n_estimators=randint(100, 500),
    model__max_depth=randint(3, 10),
    model__learning_rate=uniform(0.01, 0.2),
    model__subsample=uniform(0.6, 0.4),
    model__colsample_bytree=uniform(0.6, 0.4),
    sampler__k_neighbors=randint(3, 10),
)

# 30 sampled configurations, 5-fold CV each, ranked by F1 (suited to
# imbalanced data), using all available cores.
search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=30,
    cv=5,
    scoring="f1",
    random_state=42,
    n_jobs=-1,
)
search.fit(X_train, y_train)

best_f1 = search.best_score_
best_params = search.best_params_
print(f"최적 F1: {best_f1:.4f}")
print(f"최적 파라미터: {best_params}")

최종 평가

from sklearn.metrics import (
    classification_report, confusion_matrix,
    precision_recall_curve, roc_curve, auc
)

# average_precision_score summarizes the PR curve without interpolation.
from sklearn.metrics import average_precision_score

# Evaluate the tuned pipeline on the held-out test set.
y_pred = search.predict(X_test)
y_proba = search.predict_proba(X_test)[:, 1]  # probability of the fraud class

# Classification report
print("=== 최종 분류 리포트 ===")
print(classification_report(y_test, y_pred, target_names=["정상", "사기"]))

# Three panels: confusion matrix, PR curve, ROC curve.
cm = confusion_matrix(y_test, y_pred)
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=axes[0],
            xticklabels=["정상", "사기"], yticklabels=["정상", "사기"])
axes[0].set_title("혼동 행렬")
axes[0].set_xlabel("예측값")
axes[0].set_ylabel("실제값")

# PR curve (more informative than ROC on imbalanced data).
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
# Summarize the curve with average precision instead of auc(recall, precision):
# the trapezoidal rule interpolates linearly between PR points and can be too
# optimistic, and average precision is the same metric as the
# "average_precision" scorer used for the PR-AUC column during cross-validation.
pr_auc_val = average_precision_score(y_test, y_proba)

axes[1].plot(recall, precision, label=f"PR-AUC = {pr_auc_val:.3f}")
axes[1].set_xlabel("Recall (재현율)")
axes[1].set_ylabel("Precision (정밀도)")
axes[1].set_title("Precision-Recall 곡선")
axes[1].legend()

# ROC curve
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc_val = auc(fpr, tpr)

axes[2].plot(fpr, tpr, label=f"ROC-AUC = {roc_auc_val:.3f}")
axes[2].plot([0, 1], [0, 1], "k--")  # chance-level diagonal
axes[2].set_xlabel("False Positive Rate")
axes[2].set_ylabel("True Positive Rate")
axes[2].set_title("ROC 곡선")
axes[2].legend()

plt.tight_layout()
plt.show()

# Precision/recall trade-off as a function of the decision threshold.
# precision and recall have one more element than thresholds, so the final
# point is dropped to align the arrays.
plt.figure(figsize=(10, 5))
plt.plot(thresholds, precision[:-1], label="Precision")
plt.plot(thresholds, recall[:-1], label="Recall")
plt.xlabel("임계값 (Threshold)")
plt.ylabel("점수")
plt.title("임계값에 따른 Precision-Recall 트레이드오프")
plt.legend()
plt.show()

불균형 데이터에서 정확도(Accuracy)는 의미 없는 지표입니다. 모든 샘플을 “정상”으로 예측해도 98%의 정확도를 얻습니다. 반드시 F1, PR-AUC, Recall 등을 사용합니다. 자세한 내용은 분류 평가 지표를 참고합니다.

Q: 비지도 이상 탐지와 지도학습 분류 중 어떤 것을 선택해야 하나요?

라벨이 있으면 지도학습(XGBoost + 불균형 처리)이 일반적으로 더 높은 성능을 보입니다. 라벨이 없거나 매우 적으면 비지도 이상 탐지(Isolation Forest, LOF)를 사용합니다. 실무에서는 두 접근을 결합하여 비지도 모델의 이상 점수를 지도학습 모델의 특성으로 추가하기도 합니다.

Q: 사기 탐지에서 정밀도와 재현율 중 무엇이 더 중요한가요?

비즈니스 상황에 따라 다릅니다. 사기 놓침의 비용이 크면 재현율(Recall)을 높이고, 정상 거래 차단의 불편이 크면 정밀도(Precision)를 높입니다. 임계값을 조정하여 비즈니스 요구에 맞는 트레이드오프를 설정합니다.

체크리스트

불균형 데이터에서 적절한 평가 지표를 선택할 수 있다
SMOTE 등 오버샘플링 기법을 적용할 수 있다
Isolation Forest로 비지도 이상 탐지를 수행할 수 있다
PR 곡선으로 임계값을 조정할 수 있다

다음 문서

시계열 예측

시계열 데이터의 예측 프로젝트를 수행합니다.

불균형 샘플링 레퍼런스

다양한 불균형 처리 알고리즘을 비교합니다.

​프로젝트 개요

​프로젝트 실습

​체크리스트

​다음 문서

시계열 예측

불균형 샘플링 레퍼런스

프로젝트 개요

프로젝트 실습

체크리스트

다음 문서