Context Agent evaluation
In this example, we walk through a Context Augmentation RAG Agent application and evaluate its performance.
Try out the full example in the example repo.
from continuous_eval.eval import Module, AgentModule, Pipeline, Tool, Dataset, ModuleOutput
from continuous_eval.metrics.generation.text import DeterministicAnswerCorrectness
from continuous_eval.metrics.tools.match import ToolSelectionAccuracy
from continuous_eval.eval.tests import GreaterOrEqualThan
dataset = Dataset("examples/llama_index/context_augmentation/data")
tools = [
    Tool(
        name="march",
        args={"input": str},
        out_type=str,
    ),
    Tool(
        name="june",
        args={"input": str},
        out_type=str,
    ),
    Tool(
        name="sept",
        args={"input": str},
        out_type=str,
    ),
]
agent = AgentModule(
    name="retriever_agent",
    input=dataset.question,
    output=str,
    eval=[
        ToolSelectionAccuracy().use(
            tools=tools, ground_truths=dataset.tool_calls
        ),
    ],
    tests=[
        GreaterOrEqualThan(
            test_name="Tool Selection Accuracy",
            metric_name="score",
            min_value=0.8,
        ),
    ],
)
output = Module(
    name="answer",
    input=agent,
    output=str,
    eval=[
        DeterministicAnswerCorrectness().use(
            answer=ModuleOutput(), ground_truth=dataset.answer
        ),
    ],
    tests=[
        GreaterOrEqualThan(
            test_name="Answer Correctness",
            metric_name="rouge_l_recall",
            min_value=0.8,
        ),
    ],
)
pipeline = Pipeline([agent, output], dataset=dataset)
print(pipeline.graph_repr())
The dataset directory contains the Uber 10-Q questions together with the following manifest describing its fields:

name: Uber 10Q
description: Uber 10Q filings from 2022
format: jsonl
license: CC0
fields:
  uuid:
    description: Unique identifier for the filing
    type: UUID
  question:
    description: The question asked in the filing
    type: str
  answer:
    description: The answer to the question
    type: List[str]
  tool_calls:
    description: The tools used to extract the question and answer
    type: List[Dict[str, str]]
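For reference, a single line of the jsonl file would look roughly like the record below (shown pretty-printed; in the file it is a single line). The values are illustrative placeholders, and the exact shape of each tool_calls entry depends on what ToolSelectionAccuracy expects as ground truth; only the field names come from the manifest above.

{
  "uuid": "<uuid>",
  "question": "What was Uber's cash position at the end of March 2022?",
  "answer": ["<one or more acceptable answer strings>"],
  "tool_calls": [{"name": "march", "input": "<query passed to the march tool>"}]
}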
Next, the agent application itself: a LlamaIndex context-augmented agent over three Uber 10-Q indexes, instrumented to log its intermediate outputs to the evaluation manager.

from typing import Any
from llama_index.agent.openai_legacy import ContextRetrieverOpenAIAgent
from llama_index.core import (
    Document,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.tools.types import ToolOutput
from loguru import logger
from continuous_eval.eval.manager import eval_manager
from examples.llama_index.context_augmentation.pipeline import dataset, pipeline
eval_manager.set_pipeline(pipeline)
VERBOSE = False
class LoggableQueryEngineTool(QueryEngineTool):
    def call(self, *args: Any, **kwargs: Any) -> ToolOutput:
        logger.info(
            f"Calling {self.metadata.name} with args: {args} and kwargs: {kwargs}"
        )
        ret = super().call(*args, **kwargs)
        eval_manager.log("rag", ret.content)  # ret.raw_output.source_nodes
        return ret
try:
    # load indexes
    storage_context = StorageContext.from_defaults(persist_dir="./data/uber/march")
    march_index = load_index_from_storage(storage_context)
    storage_context = StorageContext.from_defaults(persist_dir="./data/uber/june")
    june_index = load_index_from_storage(storage_context)
    storage_context = StorageContext.from_defaults(persist_dir="./data/uber/sept")
    sept_index = load_index_from_storage(storage_context)
except:
    # build indexes across the three data sources
    march_docs = SimpleDirectoryReader(
        input_files=["./data/uber/uber_10q_march_2022.pdf"]
    ).load_data()
    june_docs = SimpleDirectoryReader(
        input_files=["./data/uber/uber_10q_june_2022.pdf"]
    ).load_data()
    sept_docs = SimpleDirectoryReader(
        input_files=["./data/uber/uber_10q_sept_2022.pdf"]
    ).load_data()
    # build index
    march_index = VectorStoreIndex.from_documents(march_docs)
    june_index = VectorStoreIndex.from_documents(june_docs)
    sept_index = VectorStoreIndex.from_documents(sept_docs)
    # persist index
    march_index.storage_context.persist(persist_dir="./data/uber/march")
    june_index.storage_context.persist(persist_dir="./data/uber/june")
    sept_index.storage_context.persist(persist_dir="./data/uber/sept")
march_engine = march_index.as_query_engine(similarity_top_k=3)
june_engine = june_index.as_query_engine(similarity_top_k=3)
sept_engine = sept_index.as_query_engine(similarity_top_k=3)
query_engine_tools = [
    LoggableQueryEngineTool(
        query_engine=march_engine,
        metadata=ToolMetadata(
            name="uber_march_10q",
            description=(
                "Provides information about Uber 10Q filings for March 2022. "
                "Use a detailed plain text question as input to the tool."
            ),
        ),
    ),
    LoggableQueryEngineTool(
        query_engine=june_engine,
        metadata=ToolMetadata(
            name="uber_june_10q",
            description=(
                "Provides information about Uber financials for June 2022. "
                "Use a detailed plain text question as input to the tool."
            ),
        ),
    ),
    LoggableQueryEngineTool(
        query_engine=sept_engine,
        metadata=ToolMetadata(
            name="uber_sept_10q",
            description=(
                "Provides information about Uber financials for Sept 2022. "
                "Use a detailed plain text question as input to the tool."
            ),
        ),
    ),
]
texts = [
    "Abbreviation: FINRA (Financial Industry Regulatory Authority)",
]
docs = [Document(text=t) for t in texts]
context_index = VectorStoreIndex.from_documents(docs)
context_agent = ContextRetrieverOpenAIAgent.from_tools_and_retriever(
    tools=query_engine_tools,
    retriever=context_index.as_retriever(),
    verbose=VERBOSE,
)
def ask(query: str):
    response = context_agent.chat(query)
    eval_manager.log("answer", response.response)
    return response
if __name__ == "__main__":
    eval_manager.start_run()
    while eval_manager.is_running():
        if eval_manager.curr_sample is None:
            break
        response = ask(eval_manager.curr_sample["question"])
        print(response)
        eval_manager.next_sample()

    print(eval_manager.samples)
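The evaluation script below reads the run results from results.jsonl, so the samples logged during the run presumably need to be persisted at the end of the loop. A minimal sketch, assuming eval_manager.evaluation.save mirrors the load call used later:

from pathlib import Path

# Hypothetical: persist the logged samples at the end of the __main__ block
# so the evaluation script can load them from results.jsonl
eval_manager.evaluation.save(Path("results.jsonl"))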
Finally, a separate evaluation script loads the run results, computes the metrics, and runs the tests defined in the pipeline.

from pathlib import Path
from continuous_eval.eval.manager import eval_manager
from examples.llama_index.context_augmentation.pipeline import pipeline
if __name__ == "__main__":
    eval_manager.set_pipeline(pipeline)
    # Evaluation
    eval_manager.evaluation.load(Path("results.jsonl"))
    eval_manager.run_metrics()
    eval_manager.metrics.save(Path("metrics_results.json"))
    # Tests
    eval_manager.metrics.load(Path("metrics_results.json"))
    agg = eval_manager.metrics.aggregate()
    print(agg)
    eval_manager.run_tests()
    eval_manager.tests.save(Path("test_results.json"))
    eval_manager.tests.load(Path("test_results.json"))
    for module_name, test_results in eval_manager.tests.results.items():
        print(f"{module_name}")
        for test_name in test_results:
            print(f"  - {test_name}: {test_results[test_name]}")
    print("Done")
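If you want to gate a CI job on the outcome, the per-module results printed above can be collapsed into a single flag. A minimal sketch, assuming each test result is a boolean pass/fail value (which is how it is printed above):

import sys

# Hypothetical helper: exit non-zero when any pipeline test failed
all_passed = all(
    bool(result)
    for test_results in eval_manager.tests.results.values()
    for result in test_results.values()
)
sys.exit(0 if all_passed else 1)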