Metric Ensembling
In this example, we show how to use user feedback data to create custom ensemble metrics.
from pathlib import Path
import numpy as np
from continuous_eval.classifiers import EnsembleMetric
from continuous_eval.classifiers.utils import eval_prediction
from continuous_eval.data_downloader import example_data_downloader
from continuous_eval.datatypes import DataSplit, SplitRatios
from continuous_eval.eval import Dataset, SingleModulePipeline
from continuous_eval.eval.manager import eval_manager
from continuous_eval.llm_factory import LLMFactory
from continuous_eval.metrics.generation.text import (
    DebertaAnswerScores,
    DeterministicAnswerCorrectness,
    FleschKincaidReadability,
    LLMBasedAnswerCorrectness,
)
# Download the correctness dataset and remove examples where the LLM refused
# to answer (i.e., said "I don't know")
dataset_jsonl = example_data_downloader("correctness")
dataset = Dataset(dataset_jsonl)
dataset.filter(lambda x: x["annotation"] != "refuse-to-answer")
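# For reference, here is a minimal sketch of the record shape this example
# relies on. The field values below are hypothetical; only the field names
# ("question", "answer", "ground_truths", "annotation") are actually used later.
example_record = {
    "question": "What is the capital of France?",
    "answer": "Paris is the capital of France.",
    "ground_truths": ["Paris"],
    "annotation": "correct",  # user feedback label, e.g. "correct" or "refuse-to-answer"
}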
# Let's define the system under evaluation.
# We use a single-module pipeline to evaluate the correctness of the answers
# using the DeterministicAnswerCorrectness, DebertaAnswerScores, and
# FleschKincaidReadability metrics.
# Attention: the DebertaAnswerScores metric requires the DeBERTa model (slow on CPU)
pipeline = SingleModulePipeline(
    dataset=dataset,
    eval=[
        DeterministicAnswerCorrectness().use(
            answer=dataset.answer,
            ground_truth_answers=dataset.ground_truths,  # type: ignore
        ),
        DebertaAnswerScores().use(
            answer=dataset.answer,
            ground_truth_answers=dataset.ground_truths,  # type: ignore
        ),
        FleschKincaidReadability().use(answer=dataset.answer),  # type: ignore
    ],
)
# We start the evaluation manager and run the metrics
eval_manager.set_pipeline(pipeline)
eval_manager.evaluation.results = dataset.data
eval_manager.run_metrics()
eval_manager.metrics.save(Path("metrics_results.json"))
# Now we build the data for the ensemble classifier:
# X is the input the classifier can use,
# y is the target the classifier should predict (1 for correct, 0 for incorrect)
X = eval_manager.metrics.to_pandas()
y = list(map(lambda x: 1 if x == "correct" else 0, dataset["annotation"]))
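# Before splitting, it can help to inspect the label balance: an imbalanced
# dataset is the reason we oversample the training set below. This check is
# purely illustrative and uses only numpy:
counts = np.bincount(np.asarray(y), minlength=2)
print(f"incorrect: {counts[0]}, correct: {counts[1]}")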
# We split the data into train, test, and calibration sets,
# specify the features we want the classifier to use,
# and oversample the training set to balance the classes
datasplit = DataSplit(
    X=X,
    y=y,
    split_ratios=SplitRatios(train=0.6, test=0.2, calibration=0.2),
    features=[
        "token_overlap_recall",
        "deberta_answer_entailment",
        "deberta_answer_contradiction",
        "flesch_reading_ease",
    ],
    oversample=True,
)
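# Optional sanity check (illustrative): print the size of each split.
# We assume each split exposes an `.X` attribute, by symmetry with
# `datasplit.test.X` used below.
for name, split in [
    ("train", datasplit.train),
    ("test", datasplit.test),
    ("calibration", datasplit.calibration),
]:
    print(f"{name}: {len(split.X)} examples")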
# We use the train and calibration sets to train the classifier
predictor = EnsembleMetric(training=datasplit.train, calibration=datasplit.calibration)
# We then use the test set to evaluate the classifier
print("Running predictor (without judicator)")
y_hat, y_set = predictor.predict(datasplit.test.X)
num_undecided = np.sum(np.all(y_set, axis=1))
print(eval_prediction(datasplit.test.y, y_hat))
print(f"Undecided: {num_undecided} ({num_undecided / len(y_set):.2%})")
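# A note on the undecided count above: `y_set` appears to be a boolean array
# with one column per class (a prediction set), so a row in which every class
# is admitted means the classifier could not decide. The exact return shape is
# an assumption here; this toy snippet only illustrates the counting logic:
toy_y_set = np.array([
    [True, False],  # confidently class 0 (incorrect)
    [False, True],  # confidently class 1 (correct)
    [True, True],   # both classes admitted -> undecided
])
print(np.sum(np.all(toy_y_set, axis=1)))  # -> 1 undecided example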
#######################################################################################
# Optional: use a judicator to resolve undecided examples
# Attention: the LLM model can be slow
#######################################################################################
print("\nRunning predictor (with judicator)")llm_metric = LLMBasedAnswerCorrectness(LLMFactory("gpt-4-1106-preview"))
def judicator(idx):
    # The judicator receives the index of a test-set example on which the
    # classifier is undecided. Since we are assessing answer correctness here,
    # it returns True if the example is correct and False otherwise.
    datum = dataset.data[idx]
    metric_result = llm_metric(
        question=datum["question"],
        answer=datum["answer"],
        ground_truth_answers=datum["ground_truths"],
    )
    return metric_result["LLM_based_answer_correctness"] >= 0.5
y_hat, _ = predictor.predict(datasplit.test.X, judicator=judicator)
print(eval_prediction(datasplit.test.y, y_hat))