Metric Ensembling
In this example, we show how to use user feedback data to create custom ensemble metrics.
from pathlib import Path
import numpy as np
from continuous_eval.classifiers import EnsembleMetric
from continuous_eval.classifiers.utils import eval_prediction
from continuous_eval.data_downloader import example_data_downloader
from continuous_eval.datatypes import DataSplit, SplitRatios
from continuous_eval.eval import Dataset, SingleModulePipeline
from continuous_eval.eval.manager import eval_manager
from continuous_eval.llm_factory import LLMFactory
from continuous_eval.metrics.generation.text import (
    DebertaAnswerScores,
    DeterministicAnswerCorrectness,
    FleschKincaidReadability,
    LLMBasedAnswerCorrectness,
)
# Download the correctness dataset and remove examples where the LLM refused
# to answer (i.e., said "I don't know")
dataset_jsonl = example_data_downloader("correctness")
dataset = Dataset(dataset_jsonl)
dataset.filter(lambda x: x["annotation"] != "refuse-to-answer")
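# For reference, here is a minimal sketch of the record shape this example
# relies on. The field values below are hypothetical; only the field names
# ("question", "answer", "ground_truths", "annotation") are actually used later.
example_record = {
    "question": "What is the capital of France?",
    "answer": "Paris is the capital of France.",
    "ground_truths": ["Paris"],
    "annotation": "correct",  # user feedback label, e.g. "correct" or "refuse-to-answer"
}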
# Let's define the system under evaluation.
# We use a single-module pipeline to evaluate the correctness of the answers
# using the DeterministicAnswerCorrectness, DebertaAnswerScores, and
# FleschKincaidReadability metrics.
# Attention: the DebertaAnswerScores metric requires the DeBERTa model (slow on CPU)
pipeline = SingleModulePipeline(
    dataset=dataset,
    eval=[
        DeterministicAnswerCorrectness().use(
            answer=dataset.answer,
            ground_truth_answers=dataset.ground_truths,  # type: ignore
        ),
        DebertaAnswerScores().use(
            answer=dataset.answer,
            ground_truth_answers=dataset.ground_truths,  # type: ignore
        ),
        FleschKincaidReadability().use(answer=dataset.answer),  # type: ignore
    ],
)
# We start the evaluation manager and run the metrics
eval_manager.set_pipeline(pipeline)
eval_manager.evaluation.results = dataset.data
eval_manager.run_metrics()
eval_manager.metrics.save(Path("metrics_results.json"))
# Now we build the data for the ensemble classifier:
# X is the input the classifier can use,
# y is the target the classifier should predict (1 for correct, 0 for incorrect)
X = eval_manager.metrics.to_pandas()
y = list(map(lambda x: 1 if x == "correct" else 0, dataset["annotation"]))
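# Before splitting, it can help to inspect the label balance: an imbalanced
# dataset is the reason we oversample the training set below. This check is
# purely illustrative and uses only numpy:
counts = np.bincount(np.asarray(y), minlength=2)
print(f"incorrect: {counts[0]}, correct: {counts[1]}")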
# We split the data into train, test, and calibration sets,
# specify the features we want the classifier to use,
# and oversample the training set to balance the classes
datasplit = DataSplit(
    X=X,
    y=y,
    split_ratios=SplitRatios(train=0.6, test=0.2, calibration=0.2),
    features=[
        "token_overlap_recall",
        "deberta_answer_entailment",
        "deberta_answer_contradiction",
        "flesch_reading_ease",
    ],
    oversample=True,
)
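# Optional sanity check (illustrative): print the size of each split.
# We assume each split exposes an `.X` attribute, by symmetry with
# `datasplit.test.X` used below.
for name, split in [
    ("train", datasplit.train),
    ("test", datasplit.test),
    ("calibration", datasplit.calibration),
]:
    print(f"{name}: {len(split.X)} examples")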
# We use the train and calibration sets to train the classifier
predictor = EnsembleMetric(training=datasplit.train, calibration=datasplit.calibration)
# We then use the test set to evaluate the classifier
print("Running predictor (without judicator)")
y_hat, y_set = predictor.predict(datasplit.test.X)
num_undecided = np.sum(np.all(y_set, axis=1))
print(eval_prediction(datasplit.test.y, y_hat))
print(f"Undecided: {num_undecided} ({num_undecided / len(y_set):.2%})")
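# A note on the undecided count above: `y_set` appears to be a boolean array
# with one column per class (a prediction set), so a row in which every class
# is admitted means the classifier could not decide. The exact return shape is
# an assumption here; this toy snippet only illustrates the counting logic:
toy_y_set = np.array([
    [True, False],  # confidently class 0 (incorrect)
    [False, True],  # confidently class 1 (correct)
    [True, True],   # both classes admitted -> undecided
])
print(np.sum(np.all(toy_y_set, axis=1)))  # -> 1 undecided example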
#######################################################################################
# Optional: use a judicator to resolve undecided examples
# Attention: the LLM model can be slow
#######################################################################################
print("\nRunning predictor (with judicator)")llm_metric = LLMBasedAnswerCorrectness(LLMFactory("gpt-4-1106-preview"))
def judicator(idx):
    # The judicator receives the index of a test-set example on which the
    # classifier is undecided. Since we are assessing answer correctness here,
    # it returns True if the example is correct and False otherwise.
    datum = dataset.data[idx]
    metric_result = llm_metric(
        question=datum["question"],
        answer=datum["answer"],
        ground_truth_answers=datum["ground_truths"],
    )
    return metric_result["LLM_based_answer_correctness"] >= 0.5
y_hat, _ = predictor.predict(datasplit.test.X, judicator=judicator)
print(eval_prediction(datasplit.test.y, y_hat))