def f1_score_summary_evaluator(outputs: list[dict], reference_outputs: list[dict]) -> dict:
    """Compute a single F1 score across the whole experiment for the binary "Toxic" / "Not toxic" task."""
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    # Tally prediction/label pairs across every example in the experiment.
    for output_dict, reference_output_dict in zip(outputs, reference_outputs):
        output = output_dict["class"]
        reference_output = reference_output_dict["class"]
        if output == "Toxic" and reference_output == "Toxic":
            true_positives += 1
        elif output == "Toxic" and reference_output == "Not toxic":
            false_positives += 1
        elif output == "Not toxic" and reference_output == "Toxic":
            false_negatives += 1
    # With zero true positives, precision and recall are both 0 (and possibly
    # undefined), so return 0.0 directly to avoid division by zero below.
    if true_positives == 0:
        return {"key": "f1_score", "score": 0.0}
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    f1_score = 2 * (precision * recall) / (precision + recall)
    return {"key": "f1_score", "score": f1_score}
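As a quick sanity check, the evaluator can be called directly on hand-built lists of prediction and reference dicts (the toy data below is illustrative, not from the original): with one true positive, one false positive, and one false negative, precision and recall are both 0.5, so the F1 score is 0.5.

# Hypothetical toy data for a standalone check: 1 TP, 1 FP, 1 FN, 1 TN.
outputs = [
    {"class": "Toxic"},
    {"class": "Toxic"},
    {"class": "Not toxic"},
    {"class": "Not toxic"},
]
reference_outputs = [
    {"class": "Toxic"},
    {"class": "Not toxic"},
    {"class": "Toxic"},
    {"class": "Not toxic"},
]

print(f1_score_summary_evaluator(outputs, reference_outputs))
# {'key': 'f1_score', 'score': 0.5}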