"""Caption EgoSchema keyframes with a GPT vision model.

For each question, a fixed number of keyframes (num_kf) is subsampled,
captioned frame-by-frame via run_gpt, and written out as one JSONL record
per frame.
"""

import os
import json
import base64
import random
import argparse

import natsort
from PIL import Image
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader

from src.run_gpt import run_gpt

random.seed(10)

# Replace with a valid API key before running.
dict_api = {
    "api_key": "ADD",
}


class CustomDatasetGPT(Dataset):
    """Subsamples each question's keyframes down to num_kf images and base64-encodes them."""

    def __init__(self, questions, num_kf):
        self.questions = questions
        self.num_kf = num_kf

    def __getitem__(self, index):
        line = self.questions[index]

        # Split the keyframes into 4 equal groups and keep the first
        # num_kf // 4 frames of each group, so the subsample stays spread
        # across the whole clip.
        group = 4
        newnum_per_group = self.num_kf // group
        oldnum_per_group = len(line["VLM_path"]) // group
        assert oldnum_per_group >= newnum_per_group, (
            f"oldnum_per_group:{oldnum_per_group} is smaller than "
            f"newnum_per_group:{newnum_per_group}"
        )

        new_kf_paths = []
        new_kf_timelines = []
        for i in range(group):
            start_index = i * oldnum_per_group
            end_index = start_index + oldnum_per_group
            new_kf_paths.extend(line["VLM_path"][start_index:end_index][:newnum_per_group])
            new_kf_timelines.extend(line["VLM_timeline"][start_index:end_index][:newnum_per_group])

        # Natural sort restores chronological order for both parallel lists.
        kf_paths = natsort.natsorted(new_kf_paths)
        kf_timelines = natsort.natsorted(new_kf_timelines)

        # Decode each frame once as a sanity check, then base64-encode it
        # for the API request.
        images_base64 = []
        for e in kf_paths:
            Image.open(e).convert("RGB")
            images_base64.append(encode_image(e))
        return images_base64, kf_paths, kf_timelines

    def __len__(self):
        return len(self.questions)


def encode_image(image_path):
    """Return the base64-encoded contents of an image file."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def create_data_loader_gpt(questions, num_kf, batch_size=1, num_workers=4):
    assert batch_size == 1, "batch_size must be 1"
    dataset = CustomDatasetGPT(questions, num_kf)
    data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)
    return data_loader, dataset


def eval_model(args):
    base_dir, question_path, vlm, num_kf, temp = (
        args.output_dir,
        args.question_path,
        args.gptmodel,
        args.num_kf,
        args.temp,
    )

    with open(os.path.expanduser(question_path), "r") as f:
        questions = [json.loads(q) for q in f]

    fname = question_path.split("/")[-1]
    answer_path = f"{base_dir}/egoschema/{num_kf}/{fname}"
    os.makedirs(os.path.dirname(answer_path), exist_ok=True)
    print(f"question_path:{question_path}\nanswer_path:{answer_path}")

    ans_file = open(answer_path, "w")
    data_loader, dataset = create_data_loader_gpt(questions, num_kf)

    for (base64_image, kf_paths, kf_timelines), line in tqdm(
        zip(data_loader, questions), total=len(questions)
    ):
        # Question metadata; not used in the captioning prompt below, but
        # carried into every output record via line.copy().
        idx = line["q_uid"]
        CA = line.get("CA")
        option0 = line["option 0"]
        option1 = line["option 1"]
        option2 = line["option 2"]
        option3 = line["option 3"]
        option4 = line["option 4"]
        question = line["question"]

        lenwords = "50"
        prompt = (
            f"'C' stands for the cameraman. Describe the activity depicted in this "
            f"first-person perspective image in less than {lenwords} words. In your "
            f"answer, don't mention that the image is in first-person perspective, "
            f"as we already know this."
        )
        prompts = [prompt] * num_kf

        # With batch_size=1, the default collate wraps each path/timestamp in a
        # one-element batch, so unwrap with e[0].
        image_paths = [e[0] for e in kf_paths]
        image_timelines = [e[0] for e in kf_timelines]

        output_VLM = run_gpt(
            images=image_paths,
            texts=prompts,
            api_keys=list(dict_api.values()),
            max_tokens=2000,
            model=vlm,
            temperature=temp,
            num_threads=20,  # Tune this
            backoff_time=1 * 60,
            silent=False,
            dataset="egoschema",
            verbose=False,
        )
        output_VLM = list(output_VLM)

        # Write one JSONL record per captioned keyframe.
        for j, e in enumerate(image_timelines):
            line_frame = line.copy()
            line_frame["answer"] = f"At {e} seconds, {output_VLM[j]}"
            line_frame["AR-VLM_model_id"] = vlm
            line_frame["AR-VLM_prompt"] = prompts[j]
            line_frame["timeline"] = float(e)
            line_frame["frame_idx"] = j
            line_frame["image_paths"] = image_paths
            line_frame.pop("imgidx_kw_dict", None)
            line_frame.pop("google_drive_id", None)
            ans_file.write(json.dumps(line_frame) + "\n")

    print(f"Finished.\nquestion_path:{question_path}\nanswer_path:{answer_path}")
    ans_file.close()
    return "job is done"


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--output-dir", type=str)
    parser.add_argument("--question-path", type=str, default="")
    parser.add_argument("--num-kf", type=int)
    parser.add_argument("--gptmodel", type=str, default="gpt-4o")
    parser.add_argument("--temp", type=float, default=None)
    args = parser.parse_args()

    eval_model(args)
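# ---------------------------------------------------------------------------
# Example usage (a sketch; the script filename and all paths below are
# hypothetical placeholders). Each line of the --question-path JSONL file is
# expected to carry the keys read above: "q_uid", "question",
# "option 0".."option 4", optionally "CA", plus parallel "VLM_path" /
# "VLM_timeline" lists giving keyframe image paths and their timestamps in
# seconds, e.g.:
#
#   {"q_uid": "abc123", "question": "...", "option 0": "...",
#    "VLM_path": ["keyframes/abc123/frame_0001.jpg", "..."],
#    "VLM_timeline": ["0.5", "..."], ...}
#
# num_kf is assumed divisible by 4, since frames are drawn in 4 groups, and
# captions land in {output_dir}/egoschema/{num_kf}/{jsonl filename}:
#
#   python caption_keyframes.py \
#       --output-dir ./results \
#       --question-path ./data/egoschema_questions.jsonl \
#       --num-kf 16 \
#       --gptmodel gpt-4o \
#       --temp 0.0
# ---------------------------------------------------------------------------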