Context Agent evaluation
In this example, we walk through a Context Augmentation RAG Agent application and evaluate its performance.
Try out the full example in the example repo.
from continuous_eval.eval import Module, AgentModule, Pipeline, Tool, Dataset, ModuleOutput
from continuous_eval.metrics.generation.text import DeterministicAnswerCorrectness
from continuous_eval.metrics.tools.match import ToolSelectionAccuracy
from continuous_eval.eval.tests import GreaterOrEqualThan
dataset = Dataset("examples/llama_index/context_augmentation/data")
tools = [
    Tool(
        name="march",
        args={"input": str},
        out_type=str,
    ),
    Tool(
        name="june",
        args={"input": str},
        out_type=str,
    ),
    Tool(
        name="sept",
        args={"input": str},
        out_type=str,
    ),
]
agent = AgentModule(
    name="retriever_agent",
    input=dataset.question,
    output=str,
    eval=[
        ToolSelectionAccuracy().use(
            tools=tools, ground_truths=dataset.tool_calls
        ),
    ],
    tests=[
        GreaterOrEqualThan(
            test_name="Tool Selection Accuracy",
            metric_name="score",
            min_value=0.8,
        ),
    ],
)
output = Module(
    name="answer",
    input=agent,
    output=str,
    eval=[
        DeterministicAnswerCorrectness().use(
            answer=ModuleOutput(), ground_truth=dataset.answer
        ),
    ],
    tests=[
        GreaterOrEqualThan(
            test_name="Answer Correctness",
            metric_name="rouge_l_recall",
            min_value=0.8,
        ),
    ],
)
pipeline = Pipeline([agent, output], dataset=dataset)
print(pipeline.graph_repr())
The dataset directory contains the Uber 10-Q questions together with the following manifest describing its fields:

name: Uber 10Q
description: Uber 10Q filings from 2022
format: jsonl
license: CC0
fields:
  uuid:
    description: Unique identifier for the filing
    type: UUID
  question:
    description: The question asked in the filing
    type: str
  answer:
    description: The answer to the question
    type: List[str]
  tool_calls:
    description: The tools used to extract the question and answer
    type: List[Dict[str, str]]
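For reference, a single line of the jsonl file would look roughly like the record below (shown pretty-printed; in the file it is a single line). The values are illustrative placeholders, and the exact shape of each tool_calls entry depends on what ToolSelectionAccuracy expects as ground truth; only the field names come from the manifest above.

{
  "uuid": "<uuid>",
  "question": "What was Uber's cash position at the end of March 2022?",
  "answer": ["<one or more acceptable answer strings>"],
  "tool_calls": [{"name": "march", "input": "<query passed to the march tool>"}]
}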
Next, the agent application itself: a LlamaIndex context-augmented agent over three Uber 10-Q indexes, instrumented to log its intermediate outputs to the evaluation manager.

from typing import Any
from llama_index.agent.openai_legacy import ContextRetrieverOpenAIAgent
from llama_index.core import (
    Document,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.tools.types import ToolOutput
from loguru import logger
from continuous_eval.eval.manager import eval_manager
from examples.llama_index.context_augmentation.pipeline import dataset, pipeline
eval_manager.set_pipeline(pipeline)
VERBOSE = False
class LoggableQueryEngineTool(QueryEngineTool):
    def call(self, *args: Any, **kwargs: Any) -> ToolOutput:
        logger.info(
            f"Calling {self.metadata.name} with args: {args} and kwargs: {kwargs}"
        )
        ret = super().call(*args, **kwargs)
        eval_manager.log("rag", ret.content)  # ret.raw_output.source_nodes
        return ret
try:
    # load indexes
    storage_context = StorageContext.from_defaults(persist_dir="./data/uber/march")
    march_index = load_index_from_storage(storage_context)
    storage_context = StorageContext.from_defaults(persist_dir="./data/uber/june")
    june_index = load_index_from_storage(storage_context)
    storage_context = StorageContext.from_defaults(persist_dir="./data/uber/sept")
    sept_index = load_index_from_storage(storage_context)
except:
    # build indexes across the three data sources
    march_docs = SimpleDirectoryReader(
        input_files=["./data/uber/uber_10q_march_2022.pdf"]
    ).load_data()
    june_docs = SimpleDirectoryReader(
        input_files=["./data/uber/uber_10q_june_2022.pdf"]
    ).load_data()
    sept_docs = SimpleDirectoryReader(
        input_files=["./data/uber/uber_10q_sept_2022.pdf"]
    ).load_data()
    # build index
    march_index = VectorStoreIndex.from_documents(march_docs)
    june_index = VectorStoreIndex.from_documents(june_docs)
    sept_index = VectorStoreIndex.from_documents(sept_docs)
    # persist index
    march_index.storage_context.persist(persist_dir="./data/uber/march")
    june_index.storage_context.persist(persist_dir="./data/uber/june")
    sept_index.storage_context.persist(persist_dir="./data/uber/sept")
march_engine = march_index.as_query_engine(similarity_top_k=3)
june_engine = june_index.as_query_engine(similarity_top_k=3)
sept_engine = sept_index.as_query_engine(similarity_top_k=3)
query_engine_tools = [
    LoggableQueryEngineTool(
        query_engine=march_engine,
        metadata=ToolMetadata(
            name="uber_march_10q",
            description=(
                "Provides information about Uber 10Q filings for March 2022. "
                "Use a detailed plain text question as input to the tool."
            ),
        ),
    ),
    LoggableQueryEngineTool(
        query_engine=june_engine,
        metadata=ToolMetadata(
            name="uber_june_10q",
            description=(
                "Provides information about Uber financials for June 2022. "
                "Use a detailed plain text question as input to the tool."
            ),
        ),
    ),
    LoggableQueryEngineTool(
        query_engine=sept_engine,
        metadata=ToolMetadata(
            name="uber_sept_10q",
            description=(
                "Provides information about Uber financials for Sept 2022. "
                "Use a detailed plain text question as input to the tool."
            ),
        ),
    ),
]
texts = [
    "Abbreviation: FINRA (Financial Industry Regulatory Authority)",
]
docs = [Document(text=t) for t in texts]
context_index = VectorStoreIndex.from_documents(docs)
context_agent = ContextRetrieverOpenAIAgent.from_tools_and_retriever(
    tools=query_engine_tools,
    retriever=context_index.as_retriever(),
    verbose=VERBOSE,
)
def ask(query: str):
    response = context_agent.chat(query)
    eval_manager.log("answer", response.response)
    return response
if __name__ == "__main__":
    eval_manager.start_run()
    while eval_manager.is_running():
        if eval_manager.curr_sample is None:
            break
        response = ask(eval_manager.curr_sample["question"])
        print(response)
        eval_manager.next_sample()

    print(eval_manager.samples)
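The evaluation script below reads the run results from results.jsonl, so the samples logged during the run presumably need to be persisted at the end of the loop. A minimal sketch, assuming eval_manager.evaluation.save mirrors the load call used later:

from pathlib import Path

# Hypothetical: persist the logged samples at the end of the __main__ block
# so the evaluation script can load them from results.jsonl
eval_manager.evaluation.save(Path("results.jsonl"))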
Finally, a separate evaluation script loads the run results, computes the metrics, and runs the tests defined in the pipeline.

from pathlib import Path
from continuous_eval.eval.manager import eval_manager
from examples.llama_index.context_augmentation.pipeline import pipeline
if __name__ == "__main__":
    eval_manager.set_pipeline(pipeline)
    # Evaluation
    eval_manager.evaluation.load(Path("results.jsonl"))
    eval_manager.run_metrics()
    eval_manager.metrics.save(Path("metrics_results.json"))
    # Tests
    eval_manager.metrics.load(Path("metrics_results.json"))
    agg = eval_manager.metrics.aggregate()
    print(agg)
    eval_manager.run_tests()
    eval_manager.tests.save(Path("test_results.json"))
    eval_manager.tests.load(Path("test_results.json"))
    for module_name, test_results in eval_manager.tests.results.items():
        print(f"{module_name}")
        for test_name in test_results:
            print(f"  - {test_name}: {test_results[test_name]}")
    print("Done")
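If you want to gate a CI job on the outcome, the per-module results printed above can be collapsed into a single flag. A minimal sketch, assuming each test result is a boolean pass/fail value (which is how it is printed above):

import sys

# Hypothetical helper: exit non-zero when any pipeline test failed
all_passed = all(
    bool(result)
    for test_results in eval_manager.tests.results.values()
    for result in test_results.values()
)
sys.exit(0 if all_passed else 1)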