비용 최적화

LLM API의 비용은 토큰 수에 비례합니다. 프로덕션 환경에서 대량의 요청을 처리할 때 비용 최적화는 서비스의 경제적 지속 가능성을 결정짓습니다. 이 문서에서는 토큰 절약부터 모델 라우팅, 캐싱, 배치 처리까지 실전 비용 최적화 전략을 학습합니다.

학습 목표

이 문서를 완료하면 다음을 할 수 있습니다.

토큰 수를 정확하게 측정하고 비용을 추정할 수 있습니다
프롬프트 압축 기법으로 입력 토큰을 줄일 수 있습니다
시맨틱 캐싱으로 반복 요청의 비용을 절감할 수 있습니다
모델 라우팅 전략으로 비용 대비 성능을 최적화할 수 있습니다
Batch API를 활용하여 대량 처리 비용을 절감할 수 있습니다

왜 중요한가

LLM API 비용은 사용량에 비례하여 증가합니다. 최적화 없이 서비스를 운영하면 월간 수백만 원의 비용이 발생할 수 있습니다. 비용 계산 예시 (2024년 기준):

시나리오	일일 요청	평균 토큰/요청	모델	월간 비용 (추정, 30일·입력 단가 기준)
소규모 챗봇	1,000건	2,000 토큰	GPT-4o-mini	~$9
중규모 서비스	10,000건	3,000 토큰	GPT-4o-mini	~$135
대규모 서비스	10,000건	3,000 토큰	GPT-4o	~$2,250
문서 분석	1,000건	10,000 토큰	GPT-4o	~$750

주요 LLM 가격 비교

모델	입력 (1M 토큰)	출력 (1M 토큰)	컨텍스트 길이	성능 수준
GPT-4o	$2.50	$10.00	128K	최상위
GPT-4o-mini	$0.15	$0.60	128K	가성비
Claude 3.5 Sonnet	$3.00	$15.00	200K	최상위
Claude 3.5 Haiku	$0.80	$4.00	200K	가성비
Llama 3.1 8B (자체 서빙)	인프라 비용	인프라 비용	128K	양호

가격은 2024년 기준이며, 제공업체의 가격 정책에 따라 변동될 수 있습니다. 최신 가격은 각 제공업체의 공식 문서에서 확인하세요.

토큰 카운팅과 비용 추정

정확한 비용 관리를 위해 토큰 수를 사전에 측정하는 방법입니다.

import tiktoken

def count_tokens(text: str, model: str = "gpt-4o-mini") -> int:
    """Count the tokens that ``text`` occupies under ``model``'s tokenizer.

    Args:
        text: Text to tokenize.
        model: Model name used to select the tiktoken encoding.

    Returns:
        The number of tokens in ``text``.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken has no mapping for this model name (for example a
        # non-OpenAI model). Approximate with the gpt-4o family's encoding
        # rather than crashing a cost estimate.
        encoding = tiktoken.get_encoding("o200k_base")
    # disallowed_special=() makes text such as "<|endoftext|>" count as
    # ordinary text instead of raising ValueError.
    return len(encoding.encode(text, disallowed_special=()))

def estimate_cost(
    input_text: str,
    estimated_output_tokens: int,
    model: str = "gpt-4o-mini",
    krw_per_usd: float = 1350.0,
) -> dict:
    """Estimate the cost of a single API call.

    Args:
        input_text: Prompt text; its token count is measured with
            ``count_tokens``.
        estimated_output_tokens: Expected number of completion tokens.
        model: Model name used for both tokenization and pricing.
        krw_per_usd: Exchange rate used for the KRW figure. Defaults to
            1350, the rate that was previously hard-coded.

    Returns:
        A dict with ``input_tokens``, ``estimated_output_tokens``,
        ``input_cost_usd``, ``output_cost_usd``, ``total_cost_usd`` and
        ``total_cost_krw``.
    """
    # Per-model prices in USD per 1M tokens.
    prices = {
        "gpt-4o": {"input": 2.50, "output": 10.00},
        "gpt-4o-mini": {"input": 0.15, "output": 0.60},
    }

    input_tokens = count_tokens(input_text, model)
    # NOTE: models missing from the table are priced as gpt-4o-mini, so the
    # estimate is only meaningful for the models listed above.
    price = prices.get(model, prices["gpt-4o-mini"])

    input_cost = (input_tokens / 1_000_000) * price["input"]
    output_cost = (estimated_output_tokens / 1_000_000) * price["output"]
    total_cost = input_cost + output_cost

    return {
        "input_tokens": input_tokens,
        "estimated_output_tokens": estimated_output_tokens,
        "input_cost_usd": round(input_cost, 6),
        "output_cost_usd": round(output_cost, 6),
        "total_cost_usd": round(total_cost, 6),
        "total_cost_krw": round(total_cost * krw_per_usd, 2),
    }

# Usage example: estimate the cost of a long summarization prompt.
document_body = "긴 문서 내용..." * 100
prompt = f"다음 문서를 요약하세요: {document_body}"
cost = estimate_cost(
    input_text=prompt,
    estimated_output_tokens=500,
    model="gpt-4o-mini",
)
input_tokens = cost["input_tokens"]
usd, krw = cost["total_cost_usd"], cost["total_cost_krw"]
print(f"입력 토큰: {input_tokens}")
print(f"예상 비용: ${usd} (약 {krw}원)")

메시지 히스토리의 토큰 비용 측정:

def count_messages_tokens(messages: list, model: str = "gpt-4o-mini") -> int:
    """Estimate the prompt tokens used by a chat-completions message list.

    The overhead constants follow the OpenAI cookbook's
    ``num_tokens_from_messages`` for current chat models: 3 tokens per
    message, 1 extra token when a ``name`` field is present, and 3 tokens
    that prime the assistant reply. The result is an estimate; the usage
    reported by the API is authoritative.

    Args:
        messages: List of message dicts (``role``, ``content``, ...).
        model: Model name used to select the tiktoken encoding.

    Returns:
        Estimated total number of prompt tokens.
    """
    encoding = tiktoken.encoding_for_model(model)
    tokens_per_message = 3
    tokens_per_name = 1
    reply_priming_tokens = 3

    total = reply_priming_tokens
    for message in messages:
        total += tokens_per_message
        for key, value in message.items():
            if value is None:
                # Skip absent fields instead of counting the text "None".
                continue
            total += len(encoding.encode(str(value)))
            if key == "name":
                total += tokens_per_name

    return total

# Measure the token footprint of a short conversation history.
conversation = [
    {"role": role, "content": content}
    for role, content in (
        ("system", "당신은 NLP 전문가입니다."),
        ("user", "BERT에 대해 설명해 주세요."),
        ("assistant", "BERT는 Google이 2018년에 발표한..."),
        ("user", "GPT와의 차이점은요?"),
    )
]

tokens = count_messages_tokens(conversation)
print(f"대화 히스토리 토큰: {tokens}")

프롬프트 압축 기법

동일한 의미를 유지하면서 토큰 수를 줄이는 방법입니다.

def compress_system_prompt(verbose_prompt: str) -> str:
    """Ask gpt-4o-mini to rewrite a system prompt more concisely.

    Args:
        verbose_prompt: The system prompt to compress.

    Returns:
        The compressed prompt. The original prompt is returned unchanged
        when it is blank (no API call is made) or when the model returns no
        content.
    """
    # Nothing to compress: avoid paying for a pointless request.
    if not verbose_prompt.strip():
        return verbose_prompt

    from openai import OpenAI
    client = OpenAI()

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": (
                    "시스템 프롬프트를 의미 변경 없이 최대한 간결하게 압축하세요. "
                    "핵심 지시사항만 남기고 불필요한 수식어를 제거합니다."
                )
            },
            {"role": "user", "content": f"압축 대상:\n{verbose_prompt}"},
        ],
    )
    compressed = response.choices[0].message.content
    # Fall back to the original prompt so the caller always gets a usable
    # string, in line with the declared return type.
    return compressed if compressed else verbose_prompt

# Compare token counts before and after manual compression.
verbose = """
당신은 한국어 자연어 처리 전문가입니다. 사용자의 질문에 대해 정확하고 상세한 답변을 제공해 주세요.
답변할 때는 다음 규칙을 따라주세요:
1. 기술적으로 정확한 정보를 제공하세요
2. 한국어로 답변하되, 영어 기술 용어는 그대로 사용하세요
3. 코드 예시가 필요한 경우 Python으로 작성하세요
4. 불확실한 정보는 명시적으로 표시하세요
"""

compressed = "NLP 전문가. 한국어 답변(영문 기술 용어 유지). Python 코드 예시 포함. 불확실 정보 명시."

before, after = count_tokens(verbose), count_tokens(compressed)
saved_pct = (1 - after / before) * 100
print(f"압축 전: {before} 토큰 → 압축 후: {after} 토큰 ({saved_pct:.0f}% 절감)")

대화 히스토리 압축 (요약 기반):

from openai import OpenAI

# Module-level OpenAI client used by compress_conversation below.
client = OpenAI()

def compress_conversation(messages: list, keep_recent: int = 4) -> list:
    """Summarize older turns and keep only the most recent messages.

    ``messages[0]` is treated as the system prompt and is always kept. The
    messages between it and the last ``keep_recent`` messages are replaced
    by one system message holding a model-written summary.

    Args:
        messages: Chat history whose first element is the system prompt.
        keep_recent: Number of trailing messages to keep verbatim. ``0``
            summarizes everything after the system prompt.

    Returns:
        ``messages`` itself when it is already short enough or when no
        summary could be produced; otherwise a new, shorter list.

    Raises:
        ValueError: If ``keep_recent`` is negative.
    """
    if keep_recent < 0:
        raise ValueError("keep_recent must be non-negative")
    if len(messages) <= keep_recent + 1:  # system + recent
        return messages

    # Use an explicit index: with negative slicing, keep_recent == 0 turns
    # messages[1:-0] into an empty list and messages[-0:] into the whole
    # history.
    split_at = len(messages) - keep_recent
    system_msg = messages[0]
    old_messages = messages[1:split_at]
    recent_messages = messages[split_at:]

    # Summarize the older part of the conversation.
    summary_response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "대화 내용을 핵심 정보만 담아 3문장 이내로 요약하세요."
            },
            {
                "role": "user",
                "content": "\n".join(
                    f"{m['role']}: {m['content']}" for m in old_messages
                ),
            }
        ],
        max_tokens=200,
    )

    summary = summary_response.choices[0].message.content
    if not summary:
        # Without a summary, dropping the old turns would silently lose
        # context, so keep the history as it is.
        return messages

    return [
        system_msg,
        {"role": "system", "content": f"[이전 대화 요약] {summary}"},
        *recent_messages,
    ]

캐싱 전략 (Semantic Cache)

동일하거나 유사한 질문에 대한 반복 API 호출을 줄이는 전략입니다.

import hashlib
import json
import numpy as np
from openai import OpenAI
from typing import Optional

# Module-level OpenAI client used by SemanticCache and cached_chat below.
client = OpenAI()

class SemanticCache:
    """Embedding-based semantic cache.

    Stores (query, embedding, response) entries and answers a lookup with
    the stored response whose query embedding is most similar to the new
    query, provided the cosine similarity reaches the threshold.
    """

    def __init__(self, similarity_threshold: float = 0.95):
        """Create an empty cache.

        Args:
            similarity_threshold: Minimum cosine similarity for a hit.
        """
        self.cache: list[dict] = []  # in production, use a vector DB
        self.threshold = similarity_threshold
        # (query, embedding) computed by the latest get(); set() reuses it
        # so a cache miss does not pay for the same embedding twice.
        self._pending: Optional[tuple] = None

    def _get_embedding(self, text: str) -> list[float]:
        """Request an embedding vector for ``text`` from the API."""
        response = client.embeddings.create(
            model="text-embedding-3-small",
            input=text,
        )
        return response.data[0].embedding

    def _cosine_similarity(self, a: list[float], b: list[float]) -> float:
        """Return the cosine similarity of ``a`` and ``b``.

        Returns 0.0 when either vector has zero norm, so the result is
        never NaN.
        """
        vec_a = np.asarray(a, dtype=float)
        vec_b = np.asarray(b, dtype=float)
        denom = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
        if denom == 0.0:
            return 0.0
        return float(np.dot(vec_a, vec_b) / denom)

    def get(self, query: str) -> Optional[str]:
        """Return the cached response for the most similar query, if any.

        Args:
            query: The incoming user query.

        Returns:
            The stored response when the best similarity is at least the
            threshold, otherwise ``None``.
        """
        if not self.cache:
            # Nothing to compare against: skip the embedding request.
            return None

        query_embedding = self._get_embedding(query)
        self._pending = (query, query_embedding)

        best_match = None
        best_score = 0.0

        for entry in self.cache:
            score = self._cosine_similarity(query_embedding, entry["embedding"])
            if score > best_score:
                best_score = score
                best_match = entry

        if best_match is not None and best_score >= self.threshold:
            print(f"  [캐시 적중] 유사도: {best_score:.4f}")
            return best_match["response"]

        return None

    def set(self, query: str, response: str):
        """Store a query/response pair in the cache.

        Reuses the embedding computed by the preceding ``get`` for the same
        query when available; otherwise requests a new one.
        """
        pending = self._pending
        if pending is not None and pending[0] == query:
            embedding = pending[1]
        else:
            embedding = self._get_embedding(query)
        self._pending = None

        self.cache.append({
            "query": query,
            "embedding": embedding,
            "response": response,
        })

# Usage example: a shared cache with a 0.92 threshold instead of the
# class default of 0.95.
cache = SemanticCache(similarity_threshold=0.92)

def cached_chat(query: str) -> str:
    """Answer ``query`` from the semantic cache, calling the LLM on a miss.

    Args:
        query: The user question.

    Returns:
        The cached response when a sufficiently similar query is stored,
        otherwise a fresh gpt-4o-mini completion, which is then cached.
    """
    # 1. Cache lookup. Compare against None so that a cached empty string
    #    still counts as a hit instead of triggering another API call.
    cached = cache.get(query)
    if cached is not None:
        return cached

    # 2. Cache miss: call the API.
    print("  [API 호출]")
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": query}],
    )
    answer = response.choices[0].message.content

    # 3. Store the answer for later lookups.
    cache.set(query, answer)
    return answer

# Test: the trailing comments give the expected path for each query.
for question in (
    "BERT가 무엇인가요?",         # API call
    "BERT란 무엇인가요?",          # cache hit (similar question)
    "Transformer가 무엇인가요?",   # API call (different question)
):
    print(cached_chat(question))

모델 선택과 라우팅

질문의 복잡도에 따라 적절한 모델을 자동으로 선택하는 전략입니다.

from openai import OpenAI

# Module-level OpenAI client used by classify_complexity and smart_route below.
client = OpenAI()

def classify_complexity(query: str) -> str:
    """Classify a query as ``"simple"``, ``"moderate"`` or ``"complex"``.

    The classification is delegated to gpt-4o-mini in JSON mode. Model
    output is not trusted: if it is missing, truncated, not valid JSON,
    lacks the ``complexity`` key, or carries an unknown label, the function
    returns ``"moderate"`` instead of raising.

    Args:
        query: The user question to classify.

    Returns:
        One of ``"simple"``, ``"moderate"``, ``"complex"``.
    """
    import json

    valid_labels = {"simple", "moderate", "complex"}
    fallback = "moderate"

    response = client.chat.completions.create(
        model="gpt-4o-mini",  # classification uses the cheap model
        response_format={"type": "json_object"},
        messages=[
            {
                "role": "system",
                "content": (
                    "질문의 복잡도를 분류하세요. JSON 형식: {\"complexity\": \"simple|moderate|complex\"}\n"
                    "simple: 사실 확인, 정의, 간단한 설명\n"
                    "moderate: 비교 분석, 단계별 설명, 중간 수준 추론\n"
                    "complex: 심층 분석, 설계 판단, 복잡한 추론, 창의적 작업"
                )
            },
            {"role": "user", "content": query},
        ],
        max_tokens=50,
    )

    raw = response.choices[0].message.content
    try:
        label = json.loads(raw)["complexity"]
    except (TypeError, ValueError, KeyError):
        # TypeError: content is None or the JSON is not an object.
        # ValueError: invalid or truncated JSON (JSONDecodeError).
        # KeyError: the expected key is missing.
        return fallback

    if not isinstance(label, str):
        return fallback
    label = label.strip().lower()
    return label if label in valid_labels else fallback

def smart_route(query: str) -> str:
    """Answer ``query`` with a model chosen by its classified complexity.

    Args:
        query: The user question.

    Returns:
        The completion text from the selected model.
    """
    # Routing table: complexity label -> model.
    model_map = {
        "simple": "gpt-4o-mini",       # $0.15/1M input
        "moderate": "gpt-4o-mini",      # $0.15/1M input
        "complex": "gpt-4o",            # $2.50/1M input
    }

    complexity = classify_complexity(query)
    # An unexpected label must not crash routing; use the moderate tier.
    model = model_map.get(complexity, model_map["moderate"])
    print(f"  복잡도: {complexity} → 모델: {model}")

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": query}],
    )
    return response.choices[0].message.content

# Test: the trailing comments give the expected route for each query.
for question in (
    "BERT가 뭐예요?",  # simple -> gpt-4o-mini
    "마이크로서비스 아키텍처에서 RAG 시스템의 확장성을 분석해 주세요.",  # complex -> gpt-4o
):
    print(smart_route(question))

Batch API 활용

대량의 요청을 배치로 처리하면 일반 API 대비 50%의 비용을 절감할 수 있습니다.

import json
from openai import OpenAI

# Module-level OpenAI client used by submit_batch and check_batch_status below.
client = OpenAI()

def create_batch_requests(
    queries: list[str],
    model: str = "gpt-4o-mini",
    filepath: str = "batch_requests.jsonl",
    max_tokens: int = 500,
) -> str:
    """Write one chat-completions batch request per query to a JSONL file.

    Args:
        queries: User prompts; request ``i`` gets ``custom_id``
            ``"request-{i}"``.
        model: Model name placed in every request body.
        filepath: Destination path of the JSONL file.
        max_tokens: Completion token limit placed in every request body.

    Returns:
        ``filepath``, the path of the file that was written.
    """
    # UTF-8 is set explicitly because ensure_ascii=False writes non-ASCII
    # text as-is, which the platform default encoding may not be able to
    # represent.
    with open(filepath, "w", encoding="utf-8") as f:
        for i, query in enumerate(queries):
            request = {
                "custom_id": f"request-{i}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": model,
                    "messages": [{"role": "user", "content": query}],
                    "max_tokens": max_tokens,
                },
            }
            f.write(json.dumps(request, ensure_ascii=False) + "\n")

    return filepath

def submit_batch(filepath: str) -> str:
    """Upload a batch request file and start a batch job for it.

    Args:
        filepath: Path of the JSONL request file to upload.

    Returns:
        The id of the created batch job.
    """
    # Step 1: upload the request file with purpose "batch".
    with open(filepath, "rb") as upload_stream:
        uploaded = client.files.create(file=upload_stream, purpose="batch")

    # Step 2: create the batch job against the chat-completions endpoint.
    job = client.batches.create(
        completion_window="24h",  # complete within 24 hours
        endpoint="/v1/chat/completions",
        input_file_id=uploaded.id,
    )

    print(f"배치 ID: {job.id}")
    print(f"상태: {job.status}")
    return job.id

def check_batch_status(batch_id: str) -> dict:
    """Fetch a batch job and report its status and request counts.

    Args:
        batch_id: Id of the batch job to look up.

    Returns:
        A dict with ``status``, ``total``, ``completed`` and ``failed``.
    """
    job = client.batches.retrieve(batch_id)
    report = {"status": job.status}
    counts = job.request_counts
    for field in ("total", "completed", "failed"):
        report[field] = getattr(counts, field)
    return report

# Usage example: write the request file, then submit it as a batch job.
queries = [
    "BERT의 핵심 아이디어를 설명하세요.",
    "GPT와 BERT의 차이점은 무엇인가요?",
    "Transformer의 Self-Attention을 설명하세요.",
    # ... hundreds more queries
]

filepath = create_batch_requests(queries)
batch_id = submit_batch(filepath)

Batch API는 일반 API 대비 50% 할인됩니다. 실시간 응답이 필요 없는 작업(데이터 분석, 대량 분류, 콘텐츠 생성 등)에 적합합니다.