# cortex-hub/ai-hub/app/core/pipelines/validator.py
import tiktoken
import json
from typing import Dict, Any

class TokenLimitExceededError(Exception):
    """Custom exception raised when the input payload exceeds the token limit."""

    def __init__(self, message: str, token_count: int, token_limit: int):
        super().__init__(message)
        self.token_count = token_count
        self.token_limit = token_limit


class Validator:
    def __init__(self, token_limit: int = 100000, encoding_name: str = "cl100k_base"):
        """
        Initializes the Validator with a token limit and encoding.

        Args:
            token_limit (int): The maximum number of tokens allowed.
            encoding_name (str): The name of the tokenizer encoding to use.
        """
        self.token_limit = token_limit
        self.encoding = tiktoken.get_encoding(encoding_name)

    def precheck_tokensize(self, input_payload: Dict[str, Any]) -> None:
        """
        Checks if the input payload's token count exceeds the configured limit.

        Args:
            input_payload (Dict[str, Any]): The payload to be checked.

        Raises:
            TokenLimitExceededError: If the payload's token count is too high.
        """
        payload_string: str = json.dumps(input_payload)
        token_count: int = len(self.encoding.encode(payload_string))
        
        if token_count > self.token_limit:
            raise TokenLimitExceededError(
                f"Input payload token count ({token_count}) exceeds the limit of {self.token_limit} tokens.",
                token_count,
                self.token_limit,
            )
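

# A minimal usage sketch, assuming the Validator guards a pipeline step before
# the payload is forwarded to a model. The payload shape and the 8000-token
# limit below are illustrative assumptions, not values taken from this module.
if __name__ == "__main__":
    validator = Validator(token_limit=8000)
    payload = {"messages": [{"role": "user", "content": "Summarize this document."}]}
    try:
        validator.precheck_tokensize(payload)
        print("Payload accepted.")
    except TokenLimitExceededError as exc:
        # token_count and token_limit are carried on the exception so callers
        # can log the overage or decide to truncate / chunk the payload.
        print(f"Rejected: {exc} ({exc.token_count}/{exc.token_limit} tokens)")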