from itertools import zip_longest

import transformers.data.metrics.squad_metrics as squad_metrics

QA_INSTURCTIONS = """I want you act as an answer judge. Given a question and an answer, your objective is to determine if the provided answer contains non-factual or hallucinated information. You SHOULD give your judgement based on the following hallucination types and the world knowledge. | |
You are trying to determine if the answer misunderstands the question context and intention. | |
#Question#: What is a rare breed of dog that was derived as a variant of Rat Terrier, Shiloh Shepherd dog or American Hairless Terrier? | |
#Answer#: American Hairless Terrier | |
#Your Judgement#: No | |
You are trying to determine if there is a factual contradiction between the answer and the world knowledge. Some information in the answer might be fabricated. | |
#Question#: Are the New Orleans Outfall Canals the same length as the Augusta Canal? | |
#Answer#: No, the New Orleans Outfall Canals and the Augusta Canal are not the same length. The Orleans Canal is approximately 3.6 miles (5.8 kilometers) long while the Augusta Canal is approximately 7 miles (11.3 kilometers) long. | |
#Your Judgement#: Yes | |
#Question#: What U.S Highway gives access to Zilpo Road, and is also known as Midland Trail? | |
#Answer#: U.S Highway 70 | |
#Your Judgement#: Yes | |
You are trying to determine if the answer is too general or too specific to answer the question at an appropriate level of specificity. | |
#Question#: What genre do Superheaven and Oceansize belong to? | |
#Answer#: Superheaven and Oceansize belong to the rock genre. | |
#Your Judgement#: No | |
#Question#: What profession do Kōbō Abe and Agatha Christie share? | |
#Answer#: Playwright. | |
#Your Judgement#: No | |
You are trying to determine if the answer can be correctly inferred from the knowledge. | |
#Question#: Which band has more members, Muse or The Raconteurs? | |
#Answer#: Muse has more members than The Raconteurs. | |
#Your Judgement#: Yes | |
#Question#: Which is currently more valuable, Temagami-Lorrain Mine or Meadowbank Gold Mine? | |
#Answer#: Meadowbank Gold Mine, since Meadowbank Gold Mine is still producing gold and the TemagamiLorrain Mine has been inactive for years. | |
#Your Judgement#: No | |
You should try your best to determine if the answer contains non-factual or hallucinated information according to the above hallucination types. The answer you give MUST be \"Yes\" or \"No\"".""" | |


def doc_to_text_qa(doc: dict[str, str]) -> str:
    # Append the current question/answer pair to the judgement instructions;
    # the model is expected to continue with "Yes" or "No".
    doc_text = QA_INSTURCTIONS + "\n\n#Question#: " + doc["question"] + "\n#Answer#: " + doc["answer"] + "\n#Your Judgement#:"
    return doc_text
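
# Illustration only (hypothetical values): for a doc such as
# {"question": "Who wrote Dune?", "answer": "Frank Herbert"}, doc_to_text_qa
# appends
#   "\n\n#Question#: Who wrote Dune?\n#Answer#: Frank Herbert\n#Your Judgement#:"
# to QA_INSTURCTIONS, so the model's next tokens should be the judgement itself.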


def doc_to_target_qa(doc: dict[str, str]) -> str:
    # Gold judgement label for the example ("yes"/"no").
    return doc["hallucination"]


def em(gold_list: list[str], prediction: str) -> float:
    # Exact match on the normalised answer (squad_metrics.compute_exact).
    em_sum = 0.0
    if len(gold_list) > 1:
        for i in range(len(gold_list)):
            # Hold out one gold answer and compare the prediction against the
            # remaining golds, keeping the best score (SQuAD-style).
            gold_answers = gold_list[0:i] + gold_list[i + 1 :]
            em_sum += max(squad_metrics.compute_exact(a, prediction) for a in gold_answers)
    else:
        em_sum += max(squad_metrics.compute_exact(a, prediction) for a in gold_list)
    return em_sum / max(1, len(gold_list))
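
# Hypothetical example: em(["yes"], "Yes") evaluates to 1.0, because
# squad_metrics.compute_exact normalises case and punctuation before comparing.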


def compute_metrics(gold_list: list[str], prediction: str) -> dict[str, float]:
    f1_sum = 0.0
    em_sum = 0.0
    if len(gold_list) > 1:
        for i in range(len(gold_list)):
            # Hold out one gold answer and score the prediction against the
            # remaining golds, keeping the best EM and F1 (SQuAD-style).
            gold_answers = gold_list[0:i] + gold_list[i + 1 :]
            em_sum += max(squad_metrics.compute_exact(a, prediction) for a in gold_answers)
            f1_sum += max(squad_metrics.compute_f1(a, prediction) for a in gold_answers)
    else:
        em_sum += max(squad_metrics.compute_exact(a, prediction) for a in gold_list)
        f1_sum += max(squad_metrics.compute_f1(a, prediction) for a in gold_list)
    return {
        "em": em_sum / max(1, len(gold_list)),
        "f1": f1_sum / max(1, len(gold_list)),
    }
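
# Sketch of the behaviour (values are hypothetical): compute_metrics(["no"], "No")
# returns {"em": 1.0, "f1": 1.0}; with several gold answers, each gold is held
# out in turn and the prediction is scored against the rest before averaging.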


def process_results_qa(doc: dict[str, str], results: list[str]) -> dict[str, float]:
    # compute_metrics expects a list of gold answers, so wrap the single label.
    gold_list = [doc_to_target_qa(doc)]
    # Keep only the first line of the generated continuation as the prediction.
    pred = results[0].strip().split("\n")[0]
    scores = compute_metrics(gold_list, pred)
    return scores
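

if __name__ == "__main__":
    # Minimal smoke test under stated assumptions: the field names mirror the
    # ones used above ("question", "answer", "hallucination") and the values
    # are taken from the few-shot prompt purely for illustration.
    example_doc = {
        "question": "What genre do Superheaven and Oceansize belong to?",
        "answer": "Superheaven and Oceansize belong to the rock genre.",
        "hallucination": "no",
    }
    print(doc_to_text_qa(example_doc))
    # Simulate a model continuation; in the harness, `results` holds the
    # generated text for this request.
    print(process_results_qa(example_doc, ["no\nExplanation: ..."]))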