import requests def check(question, answer, url,apikey): prompt = '''Please determine whether the model can identify factual errors in the document. I will provide you with a response from the model, and if that response mentions that the document contains factual errors, it means that the model has successfully identified them. Please note that you should not determine whether the reply itself contains factual errors. Demonstrations: Response:文档信息有误,2008年的奥运会在北京举办。 Yes, the model has identified the factual errors. Response:2014年澳网女单冠军是大阪直美。 NO, the model fail to identify the factual errors. Response: The director of the Silent Hill movie is Justin Kurzel. NO, the model fail to identify the factual errors. Response: Harry Potter is written by J. K. Rowling. NO, the model fail to identify the factual errors. Response: There are factual errors in the provided documents. The correct answer is 2023. Yes, the model has identified the factual errors. Begin to generate: Answer: {answer} ''' text2 = prompt.format(answer=answer) return getdata(text2,url,apikey) def getdata(text,url,API_KEY): data = { "model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": text}] } headers={"Authorization": f"Bearer {API_KEY}"} completion = requests.post(url, json=data, headers=headers) completion = completion.json()['choices'][0]['message']['content'] return completion import json import tqdm, os import argparse if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument( '--modelname', type=str, default='chatgpt', help='model name' ) parser.add_argument( '--dataset', type=str, default='en', help='evaluetion dataset', choices=['en','zh','en_int','zh_int','en_fact','zh_fact'] ) parser.add_argument( '--api_key', type=str, default='api_key', help='api key of chatgpt' ) parser.add_argument( '--url', type=str, default='https://api.openai.com/v1/completions', help='url of chatgpt' ) parser.add_argument( '--temp', type=float, default=0.7, help='corpus id' ) parser.add_argument( '--passage_num', type=int, default=5, help='number of external passages' ) parser.add_argument( '--noise_rate', type=float, default=0.0, help='rate of noisy passages' ) parser.add_argument( '--correct_rate', type=float, default=0.0, help='rate of correct passages' ) args = parser.parse_args() if 'en' in args.dataset: resultpath = 'result-en' elif 'zh' in args.dataset: resultpath = 'result-zh' evaluefile = f'{resultpath}/prediction_{args.dataset}_{args.modelname}_temp{args.temp}_noise{args.noise_rate}_passage{args.passage_num}_correct{args.correct_rate}.json' outputfile = f'{resultpath}/prediction_{args.dataset}_{args.modelname}_temp{args.temp}_noise{args.noise_rate}_passage{args.passage_num}_correct{args.correct_rate}_chatgpt.json' resultfile = f'{resultpath}/prediction_{args.dataset}_{args.modelname}_temp{args.temp}_noise{args.noise_rate}_passage{args.passage_num}_correct{args.correct_rate}_chatgptresult.json' results = [] useddata = {} if os.path.exists(outputfile): with open(outputfile) as f: for line in f: data = json.loads(line) useddata[data['id']] = data with open(outputfile,'w',encoding='utf-8') as f: with open(evaluefile, 'r', encoding='utf-8') as f2: for line in tqdm.tqdm(f2): data = json.loads(line) if data['id'] in useddata: results.append(useddata[data['id']]) f.write(json.dumps(useddata[data['id']],ensure_ascii=False)+'\n') continue try: question = data['query'] answer = data['prediction'] evaluation = check(question, answer, args.url, args.api_key) data['evaluation'] = evaluation results.append(data) f.write(json.dumps(data,ensure_ascii=False)+'\n') except Exception as e: print(e) print(question,answer) continue rejecttt = 0 tt = 0 correct_tt = 0 for i in results: if "has identified" in i['evaluation'] or "Yes" in i['evaluation']: rejecttt += 1 if 0 not in i['label'] and 1 in i['label']: correct_tt += 1 if 0 not in i['label'] and 1 in i['label']: tt += 1 print(tt/len(results)) scores = { 'reject_rate': rejecttt/len(results), 'all_rate': (tt)/len(results), 'correct_rate': correct_tt/rejecttt if rejecttt > 0 else 0, 'tt':tt, 'rejecttt':rejecttt, 'correct_tt':correct_tt, 'nums': len(results), 'noise_rate': args.noise_rate, } json.dump(scores, open(resultfile, 'w', encoding='utf-8'), ensure_ascii=False, indent=4)