import torch
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
import tqdm
import random
from .vocab import Vocab
import pickle
import copy
import os


class TokenizerDataset(Dataset):
    """
    Tokenize the data in the dataset.
    Feature (feat) length: 17
    """

    def __init__(self, dataset_path, label_path, vocab, seq_len=30):
        self.dataset_path = dataset_path
        self.label_path = label_path
        self.vocab = vocab  # Vocab object

        # Related to the input dataset file
        self.lines = []
        self.labels = []
        self.feats = []

        if self.label_path:
            # One integer label per line.
            with open(self.label_path, "r") as label_file:
                for line in label_file:
                    line = line.strip()
                    if not line:
                        continue
                    self.labels.append(int(line))

            # Comment this section out if you are not using the feat attribute.
            try:
                j = 0
                with open(self.label_path.replace("label", "info"), "r") as dataset_info_file:
                    for line in dataset_info_file:
                        line = line.strip()
                        if not line:
                            continue
                        # # highGRschool_w_prior
                        # feat_vec = [float(i) for i in line.split(",")[-3].split("\t")]
                        # highGRschool_w_prior_w_diffskill_wo_fa
                        feat_vec = [float(i) for i in line.split(",")[-3].split("\t")]
                        feat2 = [float(i) for i in line.split(",")[-2].split("\t")]
                        feat_vec.extend(feat2[1:])
                        if j == 0:
                            print("Feature vector length:", len(feat_vec))
                        j += 1
                        self.feats.append(feat_vec)
            except Exception as e:
                print(e)

        with open(self.dataset_path, "r") as dataset_file:
            for line in dataset_file:
                line = line.strip()
                if line:
                    self.lines.append(line)

        self.len = len(self.lines)
        self.seq_len = seq_len
        print("Sequence length set at", self.seq_len, len(self.lines),
              len(self.labels) if self.label_path else 0)

    def __len__(self):
        return self.len

    def __getitem__(self, item):
        org_line = self.lines[item].split("\t")
        dup_line = []
        opt = False
        # Once an optional-task action appears, mask subsequent FinalAnswer actions with [UNK].
        for l in org_line:
            if l in ["OptionalTask_1", "EquationAnswer", "NumeratorFactor", "DenominatorFactor",
                     "OptionalTask_2", "FirstRow1:1", "FirstRow1:2", "FirstRow2:1", "FirstRow2:2",
                     "SecondRow", "ThirdRow"]:
                opt = True
            if opt and 'FinalAnswer-' in l:
                dup_line.append('[UNK]')
            else:
                dup_line.append(l)
        dup_line = "\t".join(dup_line)
        # print(dup_line)
        s1 = self.vocab.to_seq(dup_line, self.seq_len)  # Tokenizes and adds [CLS] and [SEP].
        s1_label = self.labels[item] if self.label_path else 0
        segment_label = [1 for _ in range(len(s1))]
        s1_feat = self.feats[item] if len(self.feats) > 0 else 0
        padding = [self.vocab.vocab['[PAD]'] for _ in range(self.seq_len - len(s1))]
        s1.extend(padding)
        segment_label.extend(padding)

        output = {'input': s1, 'label': s1_label, 'feat': s1_feat, 'segment_label': segment_label}
        return {key: torch.tensor(value) for key, value in output.items()}
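
# Example usage (an illustrative sketch, not part of the pipeline: the file
# paths below are placeholders and the `vocab` object is assumed to be built
# the same way as in the training scripts, see vocab.py).  Note that the
# matching info file is located by replacing "label" with "info" in label_path.
#
#   from torch.utils.data import DataLoader
#
#   dataset = TokenizerDataset("train.txt", "train_label.txt", vocab, seq_len=100)
#   loader = DataLoader(dataset, batch_size=32, shuffle=True)
#   batch = next(iter(loader))
#   # batch['input'] / batch['segment_label']: (32, seq_len) integer tensors
#   # (assuming to_seq caps the sequence at seq_len), batch['label']: (32,),
#   # batch['feat']: (32, 17) when the info file parses.
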
class TokenizerwSkillsDataset(Dataset):
    """
    Feature length: 17
    """

    def __init__(self, dataset_path, label_path, vocab, seq_len=30):
        print(f"dataset_path: {dataset_path}")
        print(f"label_path: {label_path}")
        self.dataset_path = dataset_path
        self.label_path = label_path
        self.vocab = vocab  # Vocab object
        self.seq_len = seq_len

        # Related to the input dataset file
        self.lines = []
        self.labels = []
        self.feats = []
        # Indices of info-file lines that parsed cleanly; labels and sequences
        # are kept only for these indices so the three lists stay aligned.
        selected_lines = []

        print("TokenizerwSkillsDataset ...")
        if self.label_path:
            # Comment this section out if you are not using the feat attribute.
            with open(self.label_path.replace("label", "info"), "r") as f:
                dataset_info_file = f.readlines()
            print("Info file lines:", len(dataset_info_file))
            j = 0
            for idx, line in enumerate(dataset_info_file):
                try:
                    line = line.strip()
                    if not line:
                        continue
                    feat_vec = [float(i) for i in line.split(",")[-9].split("\t")]
                    feat2 = [float(i) for i in line.split(",")[-8].split("\t")]
                    feat_vec.extend(feat2[1:])
                    if j == 0:
                        print("Feature vector:", len(feat_vec), feat_vec)
                    j += 1
                    self.feats.append(feat_vec)
                    selected_lines.append(idx)
                except Exception as e:
                    print(e)
                    print("Error at index:", idx)

            with open(self.label_path, "r") as label_file:
                for idx, line in enumerate(label_file):
                    line = line.strip()
                    if not line:
                        continue
                    if idx in selected_lines:
                        self.labels.append(int(line))

        with open(self.dataset_path, "r") as dataset_file:
            for idx, line in enumerate(dataset_file):
                line = line.strip()
                if line and idx in selected_lines:
                    self.lines.append(line)

        self.len = len(self.lines)
        print("Sequence length set at", self.seq_len, len(self.lines),
              len(self.labels) if self.label_path else 0)

    def __len__(self):
        return self.len

    def __getitem__(self, item):
        org_line = self.lines[item].split("\t")
        dup_line = []
        opt = False
        # Once an optional-task action appears, mask subsequent FinalAnswer actions with [UNK].
        for l in org_line:
            if l in ["OptionalTask_1", "EquationAnswer", "NumeratorFactor", "DenominatorFactor",
                     "OptionalTask_2", "FirstRow1:1", "FirstRow1:2", "FirstRow2:1", "FirstRow2:2",
                     "SecondRow", "ThirdRow"]:
                opt = True
            if opt and 'FinalAnswer-' in l:
                dup_line.append('[UNK]')
            else:
                dup_line.append(l)
        dup_line = "\t".join(dup_line)
        # print(dup_line)
        s1 = self.vocab.to_seq(dup_line, self.seq_len)  # Tokenizes and adds [CLS] and [SEP].
        s1_label = self.labels[item] if self.label_path else 0
        segment_label = [1 for _ in range(len(s1))]
        s1_feat = self.feats[item] if len(self.feats) > 0 else 0
        padding = [self.vocab.vocab['[PAD]'] for _ in range(self.seq_len - len(s1))]
        s1.extend(padding)
        segment_label.extend(padding)
        # print(s1_feat)

        output = {'input': s1, 'label': s1_label, 'feat': s1_feat, 'segment_label': segment_label}
        return {key: torch.tensor(value) for key, value in output.items()}
class TokenizerwTimeDataset(Dataset):
    """
    Feature length: 4
    """

    def __init__(self, dataset_path, label_path, vocab, seq_len=30):
        print(f"dataset_path: {dataset_path}")
        print(f"label_path: {label_path}")
        self.dataset_path = dataset_path
        self.label_path = label_path
        self.vocab = vocab  # Vocab object
        self.seq_len = seq_len

        # Related to the input dataset file
        self.lines = []
        self.labels = []
        self.feats = []
        # Indices of info-file lines with a matching time entry; labels and
        # sequences are kept only for these indices.
        selected_lines = []

        print("TokenizerwTimeDataset ...")
        # Normalized per-problem times, indexed by (school, student, progress, problem id).
        with open("ratio_proportion_change3_2223/sch_largest_100-coded/time_info/full_data_normalized_time.pkl", "rb") as f:
            time_df = pickle.load(f)
        print("time_df shape:", time_df.shape)

        if self.label_path:
            # Comment this section out if you are not using the feat attribute.
            with open(self.label_path.replace("label", "info"), "r") as f:
                dataset_info_file = f.readlines()
            print("Info file lines:", len(dataset_info_file))
            j = 0
            for idx, line in enumerate(dataset_info_file):
                try:
                    line = line.strip()
                    if not line:
                        continue
                    fields = line.split(",")
                    sch, stu, progress, prob_id = fields[0], fields[2], fields[3], fields[4]
                    row = time_df.loc[(sch, stu, progress, prob_id)]
                    # Four time features, in this order.
                    feat_vec = [row['faopt_time'].item(), row['total_time'].item(),
                                row['opt_time'].item(), row['nonopt_time'].item()]
                    if j == 0:
                        print("Feature vector:", len(feat_vec), feat_vec)
                    j += 1
                    self.feats.append(feat_vec)
                    selected_lines.append(idx)
                except Exception as e:
                    print(e)
                    print("Error at index:", idx)

            with open(self.label_path, "r") as label_file:
                for idx, line in enumerate(label_file):
                    line = line.strip()
                    if not line:
                        continue
                    if idx in selected_lines:
                        self.labels.append(int(line))

        with open(self.dataset_path, "r") as dataset_file:
            for idx, line in enumerate(dataset_file):
                line = line.strip()
                if line and idx in selected_lines:
                    self.lines.append(line)

        self.len = len(self.lines)
        print("Sequence length set at", self.seq_len, len(self.lines),
              len(self.labels) if self.label_path else 0)

    def __len__(self):
        return self.len

    def __getitem__(self, item):
        org_line = self.lines[item].split("\t")
        dup_line = []
        opt = False
        # Once an optional-task action appears, mask subsequent FinalAnswer actions with [UNK].
        for l in org_line:
            if l in ["OptionalTask_1", "EquationAnswer", "NumeratorFactor", "DenominatorFactor",
                     "OptionalTask_2", "FirstRow1:1", "FirstRow1:2", "FirstRow2:1", "FirstRow2:2",
                     "SecondRow", "ThirdRow"]:
                opt = True
            if opt and 'FinalAnswer-' in l:
                dup_line.append('[UNK]')
            else:
                dup_line.append(l)
        dup_line = "\t".join(dup_line)
        # print(dup_line)
        s1 = self.vocab.to_seq(dup_line, self.seq_len)  # Tokenizes and adds [CLS] and [SEP].
        s1_label = self.labels[item] if self.label_path else 0
        segment_label = [1 for _ in range(len(s1))]
        s1_feat = self.feats[item] if len(self.feats) > 0 else 0
        padding = [self.vocab.vocab['[PAD]'] for _ in range(self.seq_len - len(s1))]
        s1.extend(padding)
        segment_label.extend(padding)
        # print(s1_feat)

        output = {'input': s1, 'label': s1_label, 'feat': s1_feat, 'segment_label': segment_label}
        return {key: torch.tensor(value) for key, value in output.items()}
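
# The pickled time DataFrame used by TokenizerwTimeDataset and
# TokenizerwSkillsTimeDataset is expected to look roughly like the sketch
# below (inferred from the .loc lookups; the index level names are
# hypothetical, only the four columns and the 4-part key are required):
#
#   import pandas as pd
#   time_df = pd.DataFrame(
#       {"total_time": [0.42], "faopt_time": [0.10],
#        "opt_time": [0.21], "nonopt_time": [0.11]},
#       index=pd.MultiIndex.from_tuples(
#           [("school_1", "student_1", "progress_1", "problem_1")],
#           names=["school", "student", "progress", "prob_id"]),
#   )
#
# Each row holds normalized times keyed by (school, student, progress,
# problem id), matching the tuple used in time_df.loc[...] above.
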
class TokenizerwSkillsTimeDataset(Dataset):
    """
    Feature length: 17 + 4 = 21
    """

    def __init__(self, dataset_path, label_path, vocab, seq_len=30):
        print(f"dataset_path: {dataset_path}")
        print(f"label_path: {label_path}")
        self.dataset_path = dataset_path
        self.label_path = label_path
        self.vocab = vocab  # Vocab object
        self.seq_len = seq_len

        # Related to the input dataset file
        self.lines = []
        self.labels = []
        self.feats = []
        # Indices of info-file lines that parsed cleanly and have a time entry.
        selected_lines = []

        print("TokenizerwSkillsTimeDataset ...")
        # Normalized per-problem times, indexed by (school, student, progress, problem id).
        with open("ratio_proportion_change3_2223/sch_largest_100-coded/time_info/full_data_normalized_time.pkl", "rb") as f:
            time_df = pickle.load(f)
        print("time_df shape:", time_df.shape)

        if self.label_path:
            # Comment this section out if you are not using the feat attribute.
            with open(self.label_path.replace("label", "info"), "r") as f:
                dataset_info_file = f.readlines()
            print("Info file lines:", len(dataset_info_file))
            j = 0
            for idx, line in enumerate(dataset_info_file):
                try:
                    line = line.strip()
                    if not line:
                        continue
                    # 17 skill features ...
                    feat_vec = [float(i) for i in line.split(",")[-9].split("\t")]
                    feat2 = [float(i) for i in line.split(",")[-8].split("\t")]
                    feat_vec.extend(feat2[1:])
                    # ... plus 4 time features.
                    fields = line.split(",")
                    sch, stu, progress, prob_id = fields[0], fields[2], fields[3], fields[4]
                    row = time_df.loc[(sch, stu, progress, prob_id)]
                    feat_vec.extend([row['faopt_time'].item(), row['total_time'].item(),
                                     row['opt_time'].item(), row['nonopt_time'].item()])
                    if j == 0:
                        print("Feature vector:", len(feat_vec), feat_vec)
                    j += 1
                    self.feats.append(feat_vec)
                    selected_lines.append(idx)
                except Exception as e:
                    print(e)
                    print("Error at index:", idx)

            with open(self.label_path, "r") as label_file:
                for idx, line in enumerate(label_file):
                    line = line.strip()
                    if not line:
                        continue
                    if idx in selected_lines:
                        self.labels.append(int(line))

        with open(self.dataset_path, "r") as dataset_file:
            for idx, line in enumerate(dataset_file):
                line = line.strip()
                if line and idx in selected_lines:
                    self.lines.append(line)

        self.len = len(self.lines)
        print("Sequence length set at", self.seq_len, len(self.lines),
              len(self.labels) if self.label_path else 0)

    def __len__(self):
        return self.len

    def __getitem__(self, item):
        org_line = self.lines[item].split("\t")
        dup_line = []
        opt = False
        # Once an optional-task action appears, mask subsequent FinalAnswer actions with [UNK].
        for l in org_line:
            if l in ["OptionalTask_1", "EquationAnswer", "NumeratorFactor", "DenominatorFactor",
                     "OptionalTask_2", "FirstRow1:1", "FirstRow1:2", "FirstRow2:1", "FirstRow2:2",
                     "SecondRow", "ThirdRow"]:
                opt = True
            if opt and 'FinalAnswer-' in l:
                dup_line.append('[UNK]')
            else:
                dup_line.append(l)
        dup_line = "\t".join(dup_line)
        # print(dup_line)
        s1 = self.vocab.to_seq(dup_line, self.seq_len)  # Tokenizes and adds [CLS] and [SEP].
        s1_label = self.labels[item] if self.label_path else 0
        segment_label = [1 for _ in range(len(s1))]
        s1_feat = self.feats[item] if len(self.feats) > 0 else 0
        padding = [self.vocab.vocab['[PAD]'] for _ in range(self.seq_len - len(s1))]
        s1.extend(padding)
        segment_label.extend(padding)
        # print(s1_feat)

        output = {'input': s1, 'label': s1_label, 'feat': s1_feat, 'segment_label': segment_label}
        return {key: torch.tensor(value) for key, value in output.items()}
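
# Minimal smoke-test sketch (illustrative only; not part of the training
# pipeline).  The `vocab` object and the dataset/label paths are assumptions:
# build the Vocab the same way the training scripts in this repo do (see
# vocab.py) and point the paths at real pre-processed files.  The two time
# variants additionally require the pickled time DataFrame referenced above.
def _smoke_test(vocab, dataset_path, label_path, seq_len=30, batch_size=4):
    """Build each dataset variant and print the tensor shapes of one batch."""
    from torch.utils.data import DataLoader

    for cls in (TokenizerDataset, TokenizerwSkillsDataset,
                TokenizerwTimeDataset, TokenizerwSkillsTimeDataset):
        ds = cls(dataset_path, label_path, vocab, seq_len=seq_len)
        batch = next(iter(DataLoader(ds, batch_size=batch_size, shuffle=False)))
        print(cls.__name__, {k: tuple(v.shape) for k, v in batch.items()})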