import argparse
import json
import os

from tqdm import tqdm

from src.run_gpt import run_gpt

"""
Extract keywords from the given question and options.

Sample Run:
python3 extractKeyword.py --output-dir ego_base_link --question questions/500questions.jsonl --gptmodel "gpt-4-1106-preview"
"""

# Add your API key(s) here before running.
dict_api = {
    "api_key": "ADD",
}
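
# Every value in dict_api is forwarded to run_gpt via api_keys=list(dict_api.values()),
# so additional keys can be listed here if needed.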

PROMPT = (
    "Think step-by-step and for each option, identify all the specified activities. "
    "Each description of an activity should use active voice with plain verbs, contain fewer than six words, "
    "and retain as many original terms from the options as possible.\n"
    "Here are the options:\n\n"
    "option 0: {Putop0}\n"
    "option 1: {Putop1}\n"
    "option 2: {Putop2}\n"
    "option 3: {Putop3}\n"
    "option 4: {Putop4}\n"
    "option 5: {Putquestion}.\n"
    "All the options were introduced. 'C' represents the camera operator in the options. "
    "Your answer should follow the JSON format shown below and should only include the JSON result. "
    "Do not output any warnings or notes under any circumstances. Instead, adhere strictly to the provided JSON format example.\n"
    "This is one example output format.\n"
    "{\"option 0\": [\"plays soccer\", \"go to school\"], \"option 1\": [\"go to the gym\", \"go to school\"], "
    "\"option 2\": [\"go to school\", \"dry hair\"], \"option 3\": [\"plays basketball\", \"look at the tree\"], "
    "\"option 4\": [\"plays soccer\", \"drop the ball\"], \"option 5\": [\"turn the table\", \"go to school\"]}"
)
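
# The {PutopN} and {Putquestion} placeholders above are filled with str.replace() in
# main(), so the literal braces in the JSON example output stay intact (str.format is
# never used on this template).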


def main(args):
    # Create the output directory and a subfolder for the extracted keywords.
    os.makedirs(args.output_dir, exist_ok=True)
    job_dir = os.path.join(args.output_dir, "extractedKeywords")
    os.makedirs(job_dir, exist_ok=True)

    # Name the output file after the input question file.
    question_file_name = os.path.basename(args.question).replace(".jsonl", "")
    output_summary_path = os.path.join(job_dir, f"{question_file_name}.jsonl")
    print(f"Saving outputs to: {output_summary_path}")

    # Load the questions: one JSON object per line (JSONL).
    with open(os.path.expanduser(args.question), "r") as f:
        question_data = [json.loads(line) for line in f]
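
    # Each entry is expected to provide "q_uid", "question", and "option 0" through
    # "option 4"; an optional "CA" (correct answer) field is copied into the output
    # when present.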

    final_prompts = []
    final_info = []
    for entry in tqdm(question_data, desc="Building prompts"):
        q_uid = entry["q_uid"]

        # Fill the prompt template with the five options and the question.
        cur_prompt = (
            PROMPT
            .replace("{Putop0}", entry["option 0"])
            .replace("{Putop1}", entry["option 1"])
            .replace("{Putop2}", entry["option 2"])
            .replace("{Putop3}", entry["option 3"])
            .replace("{Putop4}", entry["option 4"])
            .replace("{Putquestion}", entry["question"])
        )

        final_prompts.append(cur_prompt)

        # Keep the original fields alongside the prompt so the output JSONL is self-contained.
        info = {
            "q_uid": q_uid,
            "prompt": cur_prompt,
            "option 0": entry["option 0"],
            "option 1": entry["option 1"],
            "option 2": entry["option 2"],
            "option 3": entry["option 3"],
            "option 4": entry["option 4"],
            "question": entry["question"],
        }

        # Carry over the correct answer (CA) when it is present in the input.
        if "CA" in entry:
            info["CA"] = entry["CA"]

        final_info.append(info)

    print("Sending prompts to GPT. This may take a while...")
    output_results = run_gpt(
        texts=final_prompts,
        api_keys=list(dict_api.values()),
        max_tokens=2000,
        model=args.gptmodel,
        temperature=args.temperature,
        num_threads=5,
        backoff_time=60,
        silent=False,
        dataset="extractKeyword",
    )
    # Materialize the results so they can be indexed in the same order as the prompts.
    output_results = list(output_results)

    # Write one JSON line per question, pairing the original fields with the GPT answer.
    with open(output_summary_path, "w") as outfile:
        for i, info in enumerate(final_info):
            info["answer"] = output_results[i]
            outfile.write(json.dumps(info) + "\n")

    print(f"Done! Results written to {output_summary_path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--output-dir", type=str, required=True,
                        help="Directory to store the resulting JSONL file.")
    parser.add_argument("--question", type=str, required=True,
                        help="Path to the JSONL file with question data (e.g., 500questions.jsonl).")
    parser.add_argument("--gptmodel", type=str, default="gpt-4-1106-preview",
                        help="The GPT model to call.")
    parser.add_argument("--temperature", type=float, default=None,
                        help="Temperature parameter for GPT.")

    args = parser.parse_args()
    main(args)