텍스트 분류 실습

LSTM을 사용하여 영화 리뷰의 감성(긍정/부정)을 분류하는 모델을 구현합니다.

데이터 준비

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from collections import Counter

# Example data (in practice, use a large-scale dataset such as IMDB)
texts = [
    "this movie is great and amazing",
    "terrible film waste of time",
    "excellent acting wonderful story",
    "boring and predictable plot",
    "best movie i have ever seen",
    "awful movie do not watch",
]
# Parallel to `texts`: labels[i] is the sentiment label of texts[i].
labels = [1, 0, 1, 0, 1, 0]  # 1: positive, 0: negative

# Simple whitespace tokenizer and vocabulary construction
def build_vocab(texts, min_freq=1):
    """Build a token-to-index vocabulary from an iterable of texts.

    Each text is lowercased and split on whitespace. Indices 0 and 1 are
    reserved for '<PAD>' and '<UNK>'; every token that occurs at least
    ``min_freq`` times receives the next free index, in order of first
    appearance.

    Args:
        texts: Iterable of strings.
        min_freq: Minimum number of occurrences a token needs to be kept.

    Returns:
        dict mapping each token (and the two special tokens) to its index.
    """
    token_counts = Counter(
        token for sentence in texts for token in sentence.lower().split()
    )
    vocab = {'<PAD>': 0, '<UNK>': 1}
    frequent_tokens = (
        token for token, count in token_counts.items() if count >= min_freq
    )
    # Continue numbering right after the reserved special tokens.
    for index, token in enumerate(frequent_tokens, start=len(vocab)):
        vocab[token] = index
    return vocab

# Build the vocabulary from the example texts and print its size
# (the count includes the '<PAD>' and '<UNK>' entries added by build_vocab).
vocab = build_vocab(texts)
print(f"어휘 크기: {len(vocab)}")

def encode_text(text, vocab, max_len=20):
    """Convert a text into a sequence of token indices.

    The text is lowercased and split on whitespace. Tokens missing from
    ``vocab`` map to the '<UNK>' index. Sequences shorter than ``max_len``
    are right-padded with the '<PAD>' index; otherwise the sequence is cut
    with ``[:max_len]``.

    Args:
        text: Input string.
        vocab: Mapping from token to index, containing '<UNK>' and '<PAD>'.
        max_len: Target sequence length.

    Returns:
        list of int token indices.
    """
    token_ids = []
    for token in text.lower().split():
        token_ids.append(vocab.get(token, vocab['<UNK>']))

    shortfall = max_len - len(token_ids)
    if shortfall <= 0:
        # Long enough already: truncate.
        return token_ids[:max_len]
    # Too short: pad on the right.
    return token_ids + [vocab['<PAD>']] * shortfall

class TextDataset(Dataset):
    """Dataset pairing pre-encoded token index sequences with their labels."""

    def __init__(self, texts, labels, vocab, max_len=20):
        # Encode every text once up front so __getitem__ only builds tensors.
        encoded_texts = []
        for text in texts:
            encoded_texts.append(encode_text(text, vocab, max_len))
        self.data = encoded_texts
        self.labels = labels

    def __len__(self):
        # The sample count is taken from the label collection.
        return len(self.labels)

    def __getitem__(self, idx):
        # Return (token index tensor, label tensor) for the sample at idx.
        token_ids = torch.tensor(self.data[idx])
        label = torch.tensor(self.labels[idx])
        return token_ids, label

# Encode the example texts into a dataset and iterate over it in shuffled
# mini-batches of two samples.
dataset = TextDataset(texts, labels, vocab)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

모델 정의

class LSTMClassifier(nn.Module):
    """LSTM-based text classifier (Embedding -> LSTM -> Linear).

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector.
        hidden_dim: Hidden size of the LSTM, per direction.
        num_classes: Number of output logits.
        num_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        dropout: Dropout probability applied to the embeddings and to the
            final hidden state; also used between LSTM layers when
            ``num_layers > 1``.
        pad_idx: Index of the padding token. It is passed to the embedding
            as ``padding_idx`` and used in ``forward`` to exclude padding
            positions from the LSTM. Pass ``None`` to treat every position
            as real input.
    """
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes,
                 num_layers=1, bidirectional=True, dropout=0.3, pad_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(
            embed_dim, hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            # nn.LSTM only applies dropout between stacked layers, so it is
            # disabled for a single layer.
            dropout=dropout if num_layers > 1 else 0,
        )
        direction_factor = 2 if bidirectional else 1
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * direction_factor, num_classes)

    @staticmethod
    def _sequence_lengths(x, pad_idx):
        """Return, per row, the 1-based position of the last non-padding token."""
        positions = torch.arange(1, x.size(1) + 1, device=x.device)
        lengths = ((x != pad_idx) * positions).max(dim=1).values
        # Packing rejects zero-length sequences, so an all-padding row is
        # treated as length 1. Lengths must live on the CPU for packing.
        return lengths.clamp(min=1).cpu()

    def forward(self, x):
        """Compute class logits for a batch of token index sequences.

        Args:
            x: Integer tensor of token indices, shape (batch, sequence).

        Returns:
            Tensor of logits, shape (batch, num_classes).
        """
        embedded = self.dropout(self.embedding(x))   # (batch, sequence, embed_dim)

        pad_idx = self.embedding.padding_idx
        if pad_idx is None:
            _, (h_n, _) = self.lstm(embedded)
        else:
            # Pack the batch so the LSTM stops at each row's real length.
            # Without this, the forward direction's final state is taken
            # after all trailing padding steps and the backward direction
            # starts from padding, so the logits would depend on how much
            # padding was appended.
            lengths = self._sequence_lengths(x, pad_idx)
            packed = nn.utils.rnn.pack_padded_sequence(
                embedded, lengths, batch_first=True, enforce_sorted=False
            )
            _, (h_n, _) = self.lstm(packed)

        # Combine the last layer's final forward and backward hidden states.
        if self.lstm.bidirectional:
            hidden = torch.cat([h_n[-2], h_n[-1]], dim=1)
        else:
            hidden = h_n[-1]

        hidden = self.dropout(hidden)
        logits = self.fc(hidden)
        return logits

# Create the model: a 2-layer bidirectional LSTM classifier with two output
# classes, sized to the vocabulary built above.
model = LSTMClassifier(
    vocab_size=len(vocab),
    embed_dim=64,
    hidden_dim=128,
    num_classes=2,
    num_layers=2,
    bidirectional=True,
    dropout=0.3,
)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Total number of elements across all model parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"파라미터 수: {total_params:,}")

학습

# Cross-entropy loss on the model's logits, optimized with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 50
for epoch in range(num_epochs):
    # Switch to training mode at the start of every epoch.
    model.train()
    # Running sum of the per-batch losses for this epoch.
    total_loss = 0

    for batch_text, batch_label in dataloader:
        # Move the batch to the same device as the model.
        batch_text = batch_text.to(device)
        batch_label = batch_label.to(device)

        # Clear gradients from the previous step, then run
        # forward pass -> loss -> backward pass -> parameter update.
        optimizer.zero_grad()
        output = model(batch_text)
        loss = criterion(output, batch_label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Report the mean batch loss every 10 epochs.
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}: Loss = {total_loss/len(dataloader):.4f}")

추론

def predict_sentiment(text, model, vocab, device):
    """Predict the sentiment of a single text.

    Puts ``model`` into eval mode, encodes ``text`` with ``encode_text`` and
    runs it through the model as a batch of one with gradients disabled.

    Args:
        text: Input string to classify.
        model: Classifier returning logits of shape (1, num_classes).
        vocab: Token-to-index mapping used for encoding.
        device: Device the input tensor is moved to.

    Returns:
        Tuple ``(sentiment, confidence)``: the sentiment label string
        (class 1 is positive, anything else negative) and the softmax
        probability of the predicted class.
    """
    model.eval()
    batch = torch.tensor([encode_text(text, vocab)]).to(device)

    with torch.no_grad():
        logits = model(batch)
        probabilities = torch.softmax(logits, dim=1)
        predicted_class = logits.argmax(dim=1).item()

    confidence = probabilities[0][predicted_class].item()
    if predicted_class == 1:
        sentiment = "긍정"
    else:
        sentiment = "부정"
    return sentiment, confidence

# Test: run the trained model on a couple of example sentences
test_texts = [
    "this movie is absolutely wonderful",
    "terrible and boring waste of time",
]

# Print each text with its predicted sentiment and the confidence as a percentage.
for text in test_texts:
    sentiment, confidence = predict_sentiment(text, model, vocab, device)
    print(f"'{text}' → {sentiment} ({confidence:.2%})")

실제 프로젝트에서는 torchtext나 Hugging Face datasets를 사용하여 IMDB, SST-2 등 대규모 감성 분석 데이터셋을 로드합니다. 사전학습 임베딩(GloVe, Word2Vec)을 사용하면 성능이 크게 향상됩니다.

체크리스트

텍스트를 토큰 인덱스로 변환하는 전처리 과정을 이해한다
LSTM 기반 분류기의 구조(Embedding → LSTM → FC)를 구현할 수 있다
양방향 LSTM의 은닉 상태를 결합하는 방법을 안다
padding_idx의 역할을 이해한다

체크리스트

다음 문서

학습 실무 기법

단어 임베딩

체크리스트

다음 문서

학습 실무 기법

단어 임베딩

체크리스트

다음 문서