"""Word weighting for sparse vectors (IDF-like)."""

# Static, IDF-like weights for words that show up in API endpoint text.
# Scale: 0.1 (very common, little signal) up to 1.0 (distinctive, full weight).
# Words missing from this table fall back to the 0.7 default applied in
# get_word_weight; stopwords are handled separately via STOPWORDS.
WORD_WEIGHTS: dict[str, float] = {
    # Extremely common - almost every endpoint has these
    "document": 0.2,
    "documents": 0.2,
    "id": 0.1,
    "the": 0.1,
    "and": 0.1,
    "for": 0.1,
    "with": 0.1,
    "this": 0.1,
    "that": 0.1,
    "from": 0.1,
    "are": 0.1,
    "can": 0.1,
    "will": 0.1,
    "user": 0.3,
    "users": 0.3,
    "api": 0.2,
    "request": 0.3,
    "response": 0.3,
    "data": 0.3,
    "object": 0.2,
    "array": 0.2,
    "string": 0.2,
    "type": 0.2,
    "field": 0.3,
    "fields": 0.3,
    "value": 0.3,
    "values": 0.3,
    "name": 0.3,
    "endpoint": 0.2,

    # Common HTTP/API terms (HTTP verbs and generic CRUD vocabulary)
    "get": 0.4,
    "post": 0.4,
    "put": 0.4,
    "delete": 0.5,
    "patch": 0.5,
    "list": 0.4,
    "create": 0.5,
    "update": 0.5,
    "read": 0.4,
    "returns": 0.3,
    "return": 0.3,

    # Domain-specific but common in SignNow
    "group": 0.4,
    "template": 0.5,
    "templates": 0.5,
    "folder": 0.5,
    "folders": 0.5,

    # High value - specific action words; every entry here sits above the
    # 0.7 default that unlisted words receive
    "invite": 1.0,
    "sign": 1.0,
    "signature": 1.0,
    "send": 1.0,
    "email": 0.8,
    "sms": 1.0,
    "fax": 1.0,
    "merge": 1.0,
    "download": 1.0,
    "upload": 1.0,
    "cancel": 1.0,
    "resend": 1.0,
    "reminder": 1.0,
    "embedded": 1.0,
    "freeform": 1.0,
    "payment": 1.0,
}

# Short function words that carry no signal; they are dropped outright
# (weight 0.0) instead of merely being down-weighted.
STOPWORDS: set[str] = {
    "a",
    "an",
    "as",
    "at",
    "be",
    "by",
    "do",
    "if",
    "in",
    "is",
    "it",
    "no",
    "of",
    "on",
    "or",
    "so",
    "to",
    "up",
    "we",
}


def get_word_weight(word: str) -> float:
    """Return the IDF-like weight for a single word.

    The lookup is case-insensitive and ignores surrounding whitespace.

    Args:
        word: Word to weigh, in any letter case.

    Returns:
        0.0 for empty or whitespace-only input and for stopwords.
        Otherwise the word's entry in ``WORD_WEIGHTS`` (0.1 to 1.0), or
        0.7 when the word is not listed there.
    """
    token = word.strip().lower()

    # An empty token carries no information; without this guard it would
    # fall through to the 0.7 default and be weighted like a real word.
    if not token:
        return 0.0

    if token in STOPWORDS:
        return 0.0

    return WORD_WEIGHTS.get(token, 0.7)  # Default weight for unknown words
