# Newer
# Older
# cortex-hub / ai-hub / app / core / pipelines / question_decider.py
import dspy
import json
import os
from app.db import models
from typing import List, Dict, Any, Tuple, Optional
from typing import List, Callable, Optional

class QuestionDecider(dspy.Signature):
    """
    You are a highly specialized AI assistant for software engineering tasks. Your role is to analyze a user's request and the provided codebase to decide on the best course of action: provide an answer, suggest a code change, or ask for more files. Your decisions must be based **strictly and exclusively** on the provided content.

    ---

    ### 🧠 Core Directives:

    1.  **Analyze the Request and Available Data:**
        * Examine the `question` and `chat_history` to understand the user's intent.
        * You are provided with two distinct lists of files: `retrieved_paths_with_content` (files you have and can use) and `retrieved_paths_without_content` (files you know exist but need to request their content).
        * **Crucial Rule:** The `retrieved_paths_with_content` is your complete and only source of usable code information. Do not mention or refer to any code that is not explicitly present in this data.

    2.  **Determine File Requirements:**
        * Identify any specific file paths mentioned by the user or required to fulfill the request.
        * **Do not re-request files that you already have, "have" means your requested file path is already existed in `retrieved_paths_with_content`**
        * A file is considered "missing" only if its path is not in `retrieved_paths_with_content` and is either mentioned in the request or is required for a code-change. This is the only valid reason to choose `decision='files'`. The `retrieved_paths_without_content` list helps you identify what files are candidates to request.
        * **Crucial New Rule:** If a file path mentioned by the user is **not found** in either `retrieved_paths_with_content` or `retrieved_paths_without_content`, you must choose the 'answer' decision and explain that the file could not be found. Do not request it.

    3.  **Choose the Correct Decision Path:**
        * **Decision: 'answer'**
            * Choose this if you have all the necessary information in `retrieved_paths_with_content` to provide a full, complete, and comprehensive explanation for a non-code-modification question.
            * Also choose this if the user asks about a file that is not present in any of the provided data. You must explain to the user why the file could not be found.
            * The `answer` field must contain a detailed, well-structured explanation in Markdown.
            * The `code_diff` field must be empty.

        * **Decision: 'code_change'**
            * Choose this if the user's request involves modifying or adding to the code (e.g., "fix this bug," "implement this feature," "refactor this function", "show me full code"). 
            * This decision is also for requests to **generate new code** (e.g., creating a new file from scratch). If the user asks for the "full code" of a file that doesn't exist, this is a code generation task. 
            * You must have all the relevant files with content in `retrieved_paths_with_content` to propose the change.
            * The `answer` field can be an optional, high-level summary of the change.
            * The `code_diff` field must contain the full and complete git diff showing the exact modifications, including adding new files.

        * **Decision: 'files'**
            * Choose this **only if** you need more files to fulfill the user's request.
            * The `answer` field must be a valid JSON list of strings, with each string being an explicit, complete file path that is **found in the `retrieved_paths_without_content` list.** **Do not use wildcard characters like `*` or `?`.**
            * The `code_diff` field must be empty.

    4.  **Final Output Requirements:**
        * Your output must be a valid JSON object matching the schema.
        * **Crucial New Rule:** Do not mention internal system variables or the DSPy signature fields (`retrieved_paths_with_content`, `retrieved_paths_without_content`) in the `reasoning`, `answer`, or `code_diff` fields. The output should be user-friendly and not leak implementation details.
        * Be helpful, precise, and adhere strictly to these rules. Do not hallucinate file paths or content.
    ---
    """
    # NOTE: the docstring above is not ordinary documentation; it is the
    # instruction text of this DSPy signature, so any edit to it changes what
    # the language model is told to do.

    # --- Inputs -----------------------------------------------------------
    question = dspy.InputField(desc="The user's current question.")
    chat_history = dspy.InputField(desc="The ongoing dialogue between the user and the AI.")
    
    # The retrieved files are passed as two separate JSON strings so the model
    # can tell "content available" apart from "known path, content not loaded".
    retrieved_paths_with_content = dspy.InputField(desc="A JSON string of files that have been successfully retrieved with their full content.")
    retrieved_paths_without_content = dspy.InputField(desc="A JSON string of files that have been found but their content has not yet been loaded (e.g., empty or null content).")

    # --- Outputs ----------------------------------------------------------
    # The description names all three decision values so that it agrees with
    # the `decision` field below and with the instructions above.
    reasoning = dspy.OutputField(
        desc="First, determine if the artifacts are sufficient; if they are not, the decision type is 'files'. Otherwise, based on the question, the decision type should be either 'code_change' or 'answer'. Finally, analyze the question and determine the output fields."
    )

    decision = dspy.OutputField(
        desc="Must be one of: 'answer', 'files', or 'code_change'."
    )
    code_diff = dspy.OutputField(
        desc=(
            "If `decision` is 'code_change': the full, complete git diff of the proposed changes.\n"
            "Leave this field empty if the decision is not 'code_change'."
        )
    )
    # `answer` is overloaded: its expected format depends on `decision`.
    answer = dspy.OutputField(
        desc=(
            "If `decision` is 'answer': a comprehensive, well-structured explanation in Markdown.\n"
            "If `decision` is 'files': a JSON-formatted list of required file paths.\n"
            "If `decision` is 'code_change': an optional, high-level summary of the proposed changes. Leave empty if no summary is needed."
        )
    )

class CodeRagQuestionDecider(dspy.Module):
    """DSPy module that decides how to respond to a code-related question.

    The module splits the retrieved files into those that carry content and
    those that only have a known path, passes both lists (as JSON strings)
    together with the question and the formatted chat history to a
    ``dspy.ChainOfThought`` predictor built on ``QuestionDecider``, and
    returns the predicted fields.
    """

    def __init__(self, log_dir: str = "ai_payloads", history_formatter: Optional[Callable[[List[models.Message]], str]] = None):
        """
        Args:
            log_dir: Stored on the instance as ``self.log_dir``; it is not
                used by any method of this class.
            history_formatter: Optional callable turning the message history
                into a single string. Falls back to
                ``_default_history_formatter`` when omitted.
        """
        super().__init__()
        self.log_dir = log_dir
        # Chain-of-thought predictor driven by the QuestionDecider signature.
        self.decider = dspy.ChainOfThought(QuestionDecider)
        self.history_formatter = history_formatter or self._default_history_formatter

    def _default_history_formatter(self, history: List[models.Message]) -> str:
        """Render the history as one ``Human: ...`` / ``Assistant: ...`` line per message.

        A message whose ``sender`` equals ``'user'`` is labelled ``Human``;
        every other sender is labelled ``Assistant``. A missing (``None``) or
        empty history yields an empty string.
        """
        return "\n".join(
            f"{'Human' if msg.sender == 'user' else 'Assistant'}: {msg.content}"
            for msg in (history or [])
        )

    @staticmethod
    def _split_retrieved_files(
        retrieved_data: Optional[Dict[str, Any]],
    ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
        """Partition ``retrieved_data['retrieved_files']`` by content availability.

        Returns:
            ``(with_content, without_content)``. Entries with a non-empty
            string ``content`` go to the first list as
            ``{"file_path", "content"}``; remaining entries that have a
            truthy ``file_path`` go to the second list as ``{"file_path"}``.
            Anything that is not a dict is skipped.
        """
        with_content: List[Dict[str, Any]] = []
        without_content: List[Dict[str, Any]] = []

        # Tolerate a missing or malformed payload instead of raising.
        if not isinstance(retrieved_data, dict):
            return with_content, without_content

        files_to_process = retrieved_data.get("retrieved_files", [])
        if not isinstance(files_to_process, list):
            return with_content, without_content

        for entry in files_to_process:
            if not isinstance(entry, dict):
                continue

            file_path = entry.get("file_path")
            file_content = entry.get("content")

            if file_content and isinstance(file_content, str):
                with_content.append({"file_path": file_path, "content": file_content})
            elif file_path:
                without_content.append({"file_path": file_path})

        return with_content, without_content

    async def forward(
        self,
        question: str,
        history:  List[models.Message],
        retrieved_data: Dict[str, Any]
    ) -> Tuple[str, str, str, str]:
        """
        Runs the decision model with the current user input and code context.

        Args:
            question: The user's query.
            history: The chat history as a list of message objects; it is
                converted to text by ``self.history_formatter``.
            retrieved_data: A dictionary whose ``"retrieved_files"`` key holds
                a list of dicts, each with ``"file_path"`` and ``"content"``
                entries.

        Returns:
            A tuple of (answer, reasoning, decision, code_diff). ``decision``
            is stripped of surrounding whitespace and lower-cased; fields that
            are missing or ``None`` on the prediction come back as ``""``.
        """
        with_content, without_content = self._split_retrieved_files(retrieved_data)

        input_payload = {
            "question": question,
            "chat_history": self.history_formatter(history),
            # The signature expects JSON strings for both file lists.
            "retrieved_paths_with_content": json.dumps(with_content, indent=2),
            "retrieved_paths_without_content": json.dumps(without_content, indent=2),
        }

        prediction = await self.decider.acall(**input_payload)

        # A field may be absent or None on the prediction; normalise to "" so
        # the string operations below and in callers do not fail.
        decision = str(getattr(prediction, "decision", "") or "").strip().lower()
        answer = getattr(prediction, "answer", "") or ""
        code_diff = getattr(prediction, "code_diff", "") or ""
        reasoning = getattr(prediction, "reasoning", "") or ""

        return answer, reasoning, decision, code_diff