Bot WA AI Cost Optimization

Cara menghemat biaya AI untuk WhatsApp bot. Model selection, caching, token optimization. Panduan lengkap!

Bot WA AI Cost Optimization
Bot WA AI Cost Optimization

AI bisa mahal jika tidak dikelola!

Tanpa optimasi, biaya AI bisa membengkak dengan cepat. Panduan ini membantu kamu tetap hemat tanpa mengorbankan kualitas.


Memahami AI Pricing

💰 PRICING STRUCTURE:

OPENAI:
┌─────────────────┬──────────┬───────────┐
│ Model           │ Input    │ Output    │
├─────────────────┼──────────┼───────────┤
│ GPT-4o          │ $2.50/M  │ $10.00/M  │
│ GPT-4o-mini     │ $0.15/M  │ $0.60/M   │
│ GPT-3.5-turbo   │ $0.50/M  │ $1.50/M   │
└─────────────────┴──────────┴───────────┘
M = 1 Million tokens

ANTHROPIC (Claude):
┌─────────────────┬──────────┬───────────┐
│ Model           │ Input    │ Output    │
├─────────────────┼──────────┼───────────┤
│ Claude 3.5 Son. │ $3.00/M  │ $15.00/M  │
│ Claude 3 Haiku  │ $0.25/M  │ $1.25/M   │
└─────────────────┴──────────┴───────────┘

GOOGLE (Gemini):
- Gemini Pro: Free tier generous
- Gemini 1.5: $0.35-$1.05/M

Strategy 1: Model Selection

Tiered Model Approach:

javascript

// Use different models for different tasks
const MODEL_TIERS = {
    simple: 'gpt-4o-mini',      // FAQ, simple queries
    standard: 'gpt-4o-mini',    // Most conversations
    complex: 'gpt-4o',          // Complex reasoning
    creative: 'gpt-4o'          // Creative tasks
};

/**
 * Pick the cheapest model tier that fits the task.
 * @param {string} taskType - 'simple' | 'standard' | 'complex' (output of classifyTask); 'faq' also accepted.
 * @param {number} [messageLength=Infinity] - length of the user message in characters.
 * @param {number} [conversationDepth=0] - number of messages already in the conversation.
 * @returns {string} model name from MODEL_TIERS.
 */
function selectModel(taskType, messageLength = Infinity, conversationDepth = 0) {
    // Simple FAQ or short messages.
    // NOTE: classifyTask() returns 'simple', not 'faq' — accept both so the
    // cheap tier is actually reachable via classification.
    if (taskType === 'faq' || taskType === 'simple' || messageLength < 50) {
        return MODEL_TIERS.simple;
    }

    // Complex multi-turn conversations need the stronger model.
    if (conversationDepth > 5 || taskType === 'complex') {
        return MODEL_TIERS.complex;
    }

    // Default to standard
    return MODEL_TIERS.standard;
}

// Route a user message to the model tier that matches its difficulty,
// then send the full conversation to the chosen model.
async function chat(userId, message) {
    const conversation = await getConversationContext(userId);

    const model = selectModel(
        classifyTask(message),
        message.length,
        conversation.messages.length
    );

    return openai.chat.completions.create({
        model,
        messages: conversation.messages
    });
}

Task Classification:

javascript

// Cheap heuristic classifier — regex only, no API call.
// Returns 'simple', 'complex', or 'standard' (checked in that order).
function classifyTask(message) {
    const text = message.toLowerCase();

    const PATTERNS = {
        simple: [
            /^(hai|halo|hi|hello)/,
            /jam (buka|operasional)/,
            /alamat/,
            /harga .{0,20}$/,
            /^(ya|ok|oke|siap|makasih)/
        ],
        complex: [
            /bandingkan|compare/,
            /jelaskan.*detail/,
            /analisis|analisa/,
            /rekomendasi.{20,}/
        ]
    };

    for (const [label, patterns] of Object.entries(PATTERNS)) {
        if (patterns.some((re) => re.test(text))) {
            return label;
        }
    }

    return 'standard';
}

Strategy 2: Caching

Response Caching:

javascript

const NodeCache = require('node-cache');
const crypto = require('crypto');

// In-memory response cache; entries expire after 1 hour (stdTTL is in seconds).
const responseCache = new NodeCache({ stdTTL: 3600 });

/**
 * Build a deterministic cache key for a message list.
 * Joins every message body with '|' and hashes the result
 * (md5 is fine here — keys only need uniqueness, not security).
 */
function generateCacheKey(messages) {
    const joined = messages.map((m) => m.content).join('|');
    return crypto.createHash('md5').update(joined).digest('hex');
}

// Answer from the in-memory cache when possible; otherwise ask the model
// and store the reply for the next identical request.
async function cachedChat(messages) {
    const key = generateCacheKey(messages);

    const hit = responseCache.get(key);
    if (hit) {
        console.log('Cache hit!');
        return hit;
    }

    const completion = await openai.chat.completions.create({
        model: 'gpt-4o-mini',
        messages
    });
    const answer = completion.choices[0].message.content;

    responseCache.set(key, answer);
    return answer;
}

Semantic Caching:

javascript

// Cache similar questions, not just exact matches.
// NOTE(review): on a miss this generates AND stores a new response, so it
// never returns a falsy value — callers cannot use it as a pure cache probe.
async function semanticCache(userMessage) {
    // Get embedding for user message
    const embedding = await createEmbedding(userMessage);

    // Search for similar cached queries.
    // $vectorSearch does not expose its similarity score as a plain field —
    // it must be surfaced via $meta before $match can filter on it.
    const similar = await db.queryCache.aggregate([
        {
            $vectorSearch: {
                index: 'cache_vector_index',
                path: 'embedding',
                queryVector: embedding,
                numCandidates: 10,
                limit: 1
            }
        },
        {
            $addFields: { score: { $meta: 'vectorSearchScore' } }
        },
        {
            $match: {
                score: { $gte: 0.95 } // Very high similarity
            }
        }
    ]).toArray();

    if (similar.length > 0) {
        console.log('Semantic cache hit!');
        return similar[0].response;
    }

    // Generate new response
    const response = await generateResponse(userMessage);

    // Cache with embedding so future similar queries can hit it.
    await db.queryCache.insertOne({
        query: userMessage,
        embedding,
        response,
        createdAt: new Date()
    });

    return response;
}

Strategy 3: Token Optimization

Shorter System Prompts:

javascript

// The system prompt is re-sent on EVERY request, so its length is a
// recurring per-call cost — keep it short.

// ❌ BAD: Long verbose prompt
const BAD_PROMPT = `
Kamu adalah customer service AI yang sangat helpful dan ramah 
untuk toko online fashion wanita bernama [BRAND]. Kamu harus 
selalu menjawab dengan sopan dan menggunakan bahasa Indonesia 
yang baik dan benar. Kamu juga harus menggunakan emoji yang 
sesuai untuk membuat percakapan lebih friendly. Jangan lupa 
untuk selalu menawarkan bantuan di akhir setiap response...
[500+ words]
`;

// ✅ GOOD: Concise effective prompt (same intent, a fraction of the tokens)
const GOOD_PROMPT = `CS AI untuk [BRAND] (fashion wanita).
Bahasa: Indonesia casual, pakai "Kak".
Tone: Ramah, helpful, emoji secukupnya.
Rules: Jawab singkat, akurat, tawarkan bantuan lain.`;

Trim Conversation History:

javascript

/**
 * Drop the oldest messages until the history fits a token budget.
 * The system prompt (if any) is always retained and kept FIRST; the
 * remaining messages are kept newest-first until the budget is exhausted.
 * @param {Array<{role: string, content: string}>} messages
 * @param {number} [maxTokens=2000] - approximate token budget.
 * @returns {Array<{role: string, content: string}>} trimmed history, system prompt first.
 */
function trimConversationHistory(messages, maxTokens = 2000) {
    // Estimate tokens (rough: 1 token ≈ 4 chars Indonesian)
    const estimateTokens = (text) => Math.ceil(text.length / 4);

    let totalTokens = 0;
    const trimmedMessages = [];

    // Always keep the system prompt; count it against the budget up front.
    const systemMessage = messages.find(m => m.role === 'system');
    if (systemMessage) {
        totalTokens += estimateTokens(systemMessage.content);
    }

    // Add messages from newest to oldest until the budget is hit.
    const nonSystemMessages = messages.filter(m => m.role !== 'system');
    for (let i = nonSystemMessages.length - 1; i >= 0; i--) {
        const msgTokens = estimateTokens(nonSystemMessages[i].content);

        if (totalTokens + msgTokens > maxTokens) {
            break;
        }

        totalTokens += msgTokens;
        trimmedMessages.unshift(nonSystemMessages[i]);
    }

    // BUG FIX: the original unshifted kept messages in front of the system
    // prompt, leaving it LAST in the array. Prepend it at the end instead
    // so the model still sees its instructions first.
    if (systemMessage) {
        trimmedMessages.unshift(systemMessage);
    }

    return trimmedMessages;
}

Summarize Long Contexts:

javascript

/**
 * Compress a long conversation: keep the 4 most recent messages and replace
 * everything older with a short model-generated summary (using a cheap model).
 * Conversations under 10 messages are returned untouched.
 * @param {Array<{role: string, content: string}>} messages
 * @returns {Promise<Array<{role: string, content: string}>>}
 */
async function summarizeContext(messages) {
    if (messages.length < 10) return messages;

    // BUG FIX: keep the original system prompt intact — the original code
    // summarized it away with the old messages, silently dropping the
    // bot's instructions.
    const systemMessages = messages.filter(m => m.role === 'system');
    const dialogue = messages.filter(m => m.role !== 'system');

    // Summarize older messages, keep the 4 most recent verbatim.
    const olderMessages = dialogue.slice(0, -4);
    const recentMessages = dialogue.slice(-4);

    const summary = await openai.chat.completions.create({
        model: 'gpt-4o-mini', // Use cheap model for summarization
        messages: [{
            role: 'user',
            content: `Summarize this conversation in 2-3 sentences:
${olderMessages.map(m => `${m.role}: ${m.content}`).join('\n')}`
        }],
        max_tokens: 150
    });

    return [
        ...systemMessages,
        { role: 'system', content: 'Previous context: ' + summary.choices[0].message.content },
        ...recentMessages
    ];
}

Strategy 4: Hybrid Approach

javascript

// Escalate through free → cheap → paid answer sources:
// keyword rules, semantic cache, FAQ search, and only then the LLM.
async function hybridChat(userId, message) {
    // 1. Check for exact keyword matches first
    const keywordResponse = checkKeywordRules(message);
    if (keywordResponse) {
        return keywordResponse; // FREE!
    }

    // 2. Check cache
    const cachedResponse = await semanticCache(message);
    if (cachedResponse) {
        return cachedResponse; // FREE!
    }

    // 3. Check FAQ database
    const faqResponse = await searchFAQ(message);
    if (faqResponse && faqResponse.confidence > 0.9) {
        return faqResponse.answer; // Minimal cost (just embedding)
    }

    // 4. Fall back to AI.
    // BUG FIX: selectModel takes (taskType, messageLength, conversationDepth);
    // the original passed only taskType, so short messages never routed to
    // the cheap tier. Pass the message length explicitly.
    const taskType = classifyTask(message);
    const model = selectModel(taskType, message.length);

    return await generateAIResponse(userId, message, model);
}

// Canned answers for exact keyword matches — replied to without any API call.
// Returns the first matching rule's response, or null when nothing matches.
function checkKeywordRules(message) {
    const rules = {
        'jam buka': 'Kami buka setiap hari jam 08:00-21:00 WIB 🕐',
        'alamat': 'Alamat kami: Jl. Contoh No. 123, Jakarta 📍',
        'transfer ke': 'Transfer ke BCA 1234567890 a/n [BRAND] 🏦',
        // ... more rules
    };

    const needle = message.toLowerCase();
    const match = Object.entries(rules).find(([keyword]) => needle.includes(keyword));

    return match ? match[1] : null;
}

Strategy 5: Rate Limiting & Quotas

javascript

// In-memory per-user daily usage counters (reset lazily on date change).
const userQuotas = new Map();

const DAILY_QUOTA = {
    free: 20,      // 20 AI calls/day
    basic: 100,    // 100 AI calls/day
    premium: 500   // 500 AI calls/day
};

/**
 * Check — and on success consume — one unit of the user's daily AI-call quota.
 * Unknown users default to the 'free' tier.
 * @param {string} userId
 * @returns {Promise<{allowed: boolean, message?: string}>}
 */
async function checkQuota(userId) {
    // BUG FIX: the original query was garbled ("{ oderId userId }", a syntax
    // error); the lookup field is userId.
    const user = await db.users.findOne({ userId });
    const tier = user?.tier || 'free';
    const quota = DAILY_QUOTA[tier];

    const today = new Date().toDateString();
    const userUsage = userQuotas.get(userId) || { date: today, count: 0 };

    // Reset the counter when the calendar day changes.
    if (userUsage.date !== today) {
        userUsage.date = today;
        userUsage.count = 0;
    }

    if (userUsage.count >= quota) {
        return {
            allowed: false,
            message: 'Kuota harian tercapai. Upgrade ke premium untuk unlimited!'
        };
    }

    userUsage.count++;
    userQuotas.set(userId, userUsage);

    return { allowed: true };
}

Cost Monitoring

javascript

// Record one API call's token usage and dollar cost per user/model.
// Rates are USD per 1K tokens; unknown models fall back to gpt-4o-mini rates.
async function trackAPIUsage(userId, model, inputTokens, outputTokens) {
    const RATES = {
        'gpt-4o': { input: 0.0025, output: 0.01 },
        'gpt-4o-mini': { input: 0.00015, output: 0.0006 },
        'claude-3-5-sonnet': { input: 0.003, output: 0.015 }
    };

    const rate = RATES[model] || RATES['gpt-4o-mini'];
    // Rates are per 1K tokens, hence the final division.
    const totalCost = (inputTokens * rate.input + outputTokens * rate.output) / 1000;

    await db.usageLogs.insertOne({
        userId,
        model,
        inputTokens,
        outputTokens,
        cost: totalCost,
        timestamp: new Date()
    });
}

// Daily cost report: aggregate today's usage logs into per-model totals
// (cost, call count, input/output tokens). "Today" starts at local midnight.
async function getDailyCostReport() {
    const startOfDay = new Date();
    startOfDay.setHours(0, 0, 0, 0);

    return db.usageLogs.aggregate([
        { $match: { timestamp: { $gte: startOfDay } } },
        {
            $group: {
                _id: '$model',
                totalCost: { $sum: '$cost' },
                totalCalls: { $sum: 1 },
                totalInputTokens: { $sum: '$inputTokens' },
                totalOutputTokens: { $sum: '$outputTokens' }
            }
        }
    ]).toArray();
}

Cost Comparison

💰 CONTOH BIAYA BULANAN (1000 chats/hari):

TANPA OPTIMASI:
- All GPT-4o
- No caching
- Long prompts
- Full history
≈ $300-500/bulan 😱

DENGAN OPTIMASI:
- Tiered models
- Semantic caching (50% hit)
- Short prompts
- Trimmed history
- Hybrid rules
≈ $30-50/bulan 🎉

SAVINGS: 90%!

Best Practices

DO ✅

- Use cheapest model that works
- Cache aggressively
- Trim conversation history
- Monitor costs daily
- Set user quotas
- Use rules for simple queries

DON'T ❌

- Always use most expensive model
- No caching
- Keep full history forever
- Ignore cost monitoring
- Unlimited usage
- AI for everything

FAQ

GPT-4o-mini cukup bagus?

Untuk 80% use cases, ya! Complex reasoning pakai GPT-4o.

Berapa budget reasonable?

Mulai $50-100/bulan untuk small business dengan 500-1000 chats/hari.


Kesimpulan

Optimasi = Same quality, 90% cheaper!

| Tanpa Optimasi  | Dengan Optimasi |
|-----------------|-----------------|
| $300-500/bulan  | $30-50/bulan    |
| Satu model saja | Tiered models   |
| Tanpa caching   | Smart caching   |

Optimize Your Bot →


Artikel Terkait