suryadev1 committed on
Commit 9f91555 · 1 Parent(s): 2d48e21

Upload dataset.py


updated dataset.py

Files changed (1)
  1. src/dataset.py +281 -301
src/dataset.py CHANGED
@@ -7,208 +7,18 @@ import random
7
  from .vocab import Vocab
8
  import pickle
9
  import copy
10
- # from sklearn.preprocessing import OneHotEncoder
11
 
12
- class PretrainerDataset(Dataset):
13
- """
14
- Class name: PretrainDataset
15
-
16
- """
17
- def __init__(self, dataset_path, vocab, seq_len=30, max_mask=0.15):
18
- self.dataset_path = dataset_path
19
- self.vocab = vocab # Vocab object
20
-
21
- # Related to input dataset file
22
- self.lines = []
23
- self.index_documents = {}
24
-
25
- seq_len_list = []
26
- with open(self.dataset_path, "r") as reader:
27
- i = 0
28
- index = 0
29
- self.index_documents[i] = []
30
- for line in tqdm.tqdm(reader.readlines()):
31
- if line:
32
- line = line.strip()
33
- if not line:
34
- i+=1
35
- self.index_documents[i] = []
36
- else:
37
- self.index_documents[i].append(index)
38
- self.lines.append(line.split("\t"))
39
- len_line = len(line.split("\t"))
40
- seq_len_list.append(len_line)
41
- index+=1
42
- reader.close()
43
- print("Sequence Stats: len: %s, min: %s, max: %s, average: %s"% (len(seq_len_list),
44
- min(seq_len_list), max(seq_len_list), sum(seq_len_list)/len(seq_len_list)))
45
- print("Unique Sequences: ", len({tuple(ll) for ll in self.lines}))
46
- self.index_documents = {k:v for k,v in self.index_documents.items() if v}
47
- print(len(self.index_documents))
48
- self.seq_len = seq_len
49
- print("Sequence length set at: ", self.seq_len)
50
- self.max_mask = max_mask
51
- print("% of input tokens selected for masking : ",self.max_mask)
52
-
53
-
54
- def __len__(self):
55
- return len(self.lines)
56
-
57
- def __getitem__(self, item):
58
- token_a = self.lines[item]
59
- # sa_masked = None
60
- # sa_masked_label = None
61
- # token_b = None
62
- # is_same_student = None
63
- # sb_masked = None
64
- # sb_masked_label = None
65
-
66
- # if self.select_next_seq:
67
- # is_same_student, token_b = self.get_token_b(item)
68
- # is_same_student = 1 if is_same_student else 0
69
- # token_a1, token_b1 = self.truncate_to_max_seq(token_a, token_b)
70
- # sa_masked, sa_masked_label = self.random_mask_seq(token_a1)
71
- # sb_masked, sb_masked_label = self.random_mask_seq(token_b1)
72
- # else:
73
- token_a = token_a[:self.seq_len-2]
74
- sa_masked, sa_masked_label, sa_masked_pos = self.random_mask_seq(token_a)
75
-
76
- s1 = ([self.vocab.vocab['[CLS]']] + sa_masked + [self.vocab.vocab['[SEP]']])
77
- s1_label = ([self.vocab.vocab['[PAD]']] + sa_masked_label + [self.vocab.vocab['[PAD]']])
78
- segment_label = [1 for _ in range(len(s1))]
79
- masked_pos = ([0] + sa_masked_pos + [0])
80
-
81
- # if self.select_next_seq:
82
- # s1 = s1 + sb_masked + [self.vocab.vocab['[SEP]']]
83
- # s1_label = s1_label + sb_masked_label + [self.vocab.vocab['[PAD]']]
84
- # segment_label = segment_label + [2 for _ in range(len(sb_masked)+1)]
85
-
86
- padding = [self.vocab.vocab['[PAD]'] for _ in range(self.seq_len - len(s1))]
87
- s1.extend(padding)
88
- s1_label.extend(padding)
89
- segment_label.extend(padding)
90
- masked_pos.extend(padding)
91
-
92
- output = {'bert_input': s1,
93
- 'bert_label': s1_label,
94
- 'segment_label': segment_label,
95
- 'masked_pos': masked_pos}
96
- # print(f"tokenA: {token_a}")
97
- # print(f"output: {output}")
98
-
99
- # if self.select_next_seq:
100
- # output['is_same_student'] = is_same_student
101
-
102
- # print(item, len(s1), len(s1_label), len(segment_label))
103
- # print(f"{item}.")
104
- return {key: torch.tensor(value) for key, value in output.items()}
105
-
106
- def random_mask_seq(self, tokens):
107
- """
108
- Input: original token seq
109
- Output: masked token seq, output label
110
- """
111
-
112
- masked_pos = []
113
- output_labels = []
114
- output_tokens = copy.deepcopy(tokens)
115
- opt_step = False
116
- for i, token in enumerate(tokens):
117
- if token in ['OptionalTask_1', 'EquationAnswer', 'NumeratorFactor', 'DenominatorFactor', 'OptionalTask_2', 'FirstRow1:1', 'FirstRow1:2', 'FirstRow2:1', 'FirstRow2:2', 'SecondRow', 'ThirdRow']:
118
- opt_step = True
119
- # if opt_step:
120
- # prob = random.random()
121
- # if prob < self.max_mask:
122
- # output_tokens[i] = random.choice([3,7,8,9,11,12,13,14,15,16,22,23,24,25,26,27,30,31,32])
123
- # masked_pos.append(1)
124
- # else:
125
- # output_tokens[i] = self.vocab.vocab.get(token, self.vocab.vocab['[UNK]'])
126
- # masked_pos.append(0)
127
- # output_labels.append(self.vocab.vocab.get(token, self.vocab.vocab['[UNK]']))
128
- # opt_step = False
129
- # else:
130
- prob = random.random()
131
- if prob < self.max_mask:
132
- # chooses 15% of token positions at random
133
- # prob /= 0.15
134
- prob = random.random()
135
- if prob < 0.8: #[MASK] token 80% of the time
136
- output_tokens[i] = self.vocab.vocab['[MASK]']
137
- masked_pos.append(1)
138
- elif prob < 0.9: # a random token 10% of the time
139
- # print(".......0.8-0.9......")
140
- if opt_step:
141
- output_tokens[i] = random.choice([7,8,9,11,12,13,14,15,16,22,23,24,25,26,27,30,31,32])
142
- opt_step = False
143
- else:
144
- output_tokens[i] = random.randint(1, len(self.vocab.vocab)-1)
145
- masked_pos.append(1)
146
- else: # the unchanged i-th token 10% of the time
147
- # print(".......unchanged......")
148
- output_tokens[i] = self.vocab.vocab.get(token, self.vocab.vocab['[UNK]'])
149
- masked_pos.append(0)
150
- # True Label
151
- output_labels.append(self.vocab.vocab.get(token, self.vocab.vocab['[UNK]']))
152
- # masked_pos_label[i] = self.vocab.vocab.get(token, self.vocab.vocab['[UNK]'])
153
- else:
154
- # i-th token with original value
155
- output_tokens[i] = self.vocab.vocab.get(token, self.vocab.vocab['[UNK]'])
156
- # Padded label
157
- output_labels.append(self.vocab.vocab['[PAD]'])
158
- masked_pos.append(0)
159
- # label_position = []
160
- # label_tokens = []
161
- # for k, v in masked_pos_label.items():
162
- # label_position.append(k)
163
- # label_tokens.append(v)
164
- return output_tokens, output_labels, masked_pos
165
-
166
- # def get_token_b(self, item):
167
- # document_id = [k for k,v in self.index_documents.items() if item in v][0]
168
- # random_document_id = document_id
169
-
170
- # if random.random() < 0.5:
171
- # document_ids = [k for k in self.index_documents.keys() if k != document_id]
172
- # random_document_id = random.choice(document_ids)
173
-
174
- # same_student = (random_document_id == document_id)
175
-
176
- # nex_seq_list = self.index_documents.get(random_document_id)
177
-
178
- # if same_student:
179
- # if len(nex_seq_list) != 1:
180
- # nex_seq_list = [v for v in nex_seq_list if v !=item]
181
-
182
- # next_seq = random.choice(nex_seq_list)
183
- # tokens = self.lines[next_seq]
184
- # # print(f"item = {item}, tokens: {tokens}")
185
- # # print(f"item={item}, next={next_seq}, same_student = {same_student}, {document_id} == {random_document_id}, b. {tokens}")
186
- # return same_student, tokens
187
-
188
- # def truncate_to_max_seq(self, s1, s2):
189
- # sa = copy.deepcopy(s1)
190
- # sb = copy.deepcopy(s1)
191
- # total_allowed_seq = self.seq_len - 3
192
-
193
- # while((len(sa)+len(sb)) > total_allowed_seq):
194
- # if random.random() < 0.5:
195
- # sa.pop()
196
- # else:
197
- # sb.pop()
198
- # return sa, sb
199
-
200
-
201
  class TokenizerDataset(Dataset):
202
  """
203
  Class name: TokenizerDataset
204
  Tokenize the data in the dataset
205
-
206
  """
207
  def __init__(self, dataset_path, label_path, vocab, seq_len=30):
208
  self.dataset_path = dataset_path
209
  self.label_path = label_path
210
  self.vocab = vocab # Vocab object
211
- # self.encoder = OneHotEncoder(sparse=False)
212
 
213
  # Related to input dataset file
214
  self.lines = []
@@ -242,39 +52,14 @@ class TokenizerDataset(Dataset):
242
  feat2 = [float(i) for i in line.split(",")[-2].split("\t")]
243
  feat_vec.extend(feat2[1:])
244
 
245
- # # highGRschool_w_prior_w_p_diffskill_wo_fa
246
- # feat_vec = [float(i) for i in line.split(",")[-3].split("\t")]
247
- # feat2 = [-float(i) for i in line.split(",")[-2].split("\t")]
248
- # feat_vec.extend(feat2[1:])
249
-
250
- # # highGRschool_w_prior_w_diffskill_0fa_skill
251
- # feat_vec = [float(i) for i in line.split(",")[-3].split("\t")]
252
- # feat2 = [float(i) for i in line.split(",")[-2].split("\t")]
253
- # fa_feat_vec = [float(i) for i in line.split(",")[-1].split("\t")]
254
-
255
- # diff_skill = [f2 if f1==0 else 0 for f2, f1 in zip(feat2, fa_feat_vec)]
256
- # feat_vec.extend(diff_skill)
257
-
258
  if j == 0:
259
  print(len(feat_vec))
260
  j+=1
261
-
262
- # feat_vec.extend(feat2[1:])
263
- # feat_vec.extend(feat2)
264
- # feat_vec = [float(i) for i in line.split(",")[-2].split("\t")]
265
- # feat_vec = feat_vec[1:]
266
- # feat_vec = [float(line.split(",")[-1])]
267
- # feat_vec = [float(i) for i in line.split(",")[-1].split("\t")]
268
- # feat_vec = [ft-f1 for ft, f1 in zip(feat_vec, fa_feat_vec)]
269
-
270
  self.feats.append(feat_vec)
271
  dataset_info_file.close()
272
  except Exception as e:
273
  print(e)
274
- # labeler = np.array([0, 1]) #np.unique(self.labels)
275
- # print(f"Labeler {labeler}")
276
- # self.encoder.fit(labeler.reshape(-1,1))
277
- # self.labels = self.encoder.transform(np.array(self.labels).reshape(-1,1))
278
 
279
  self.file = open(self.dataset_path, "r")
280
  for line in self.file:
@@ -317,95 +102,197 @@ class TokenizerDataset(Dataset):
317
  'segment_label': segment_label}
318
  return {key: torch.tensor(value) for key, value in output.items()}
319
 
320
-
321
- class TokenizerDatasetForCalibration(Dataset):
322
  """
323
- Class name: TokenizerDataset
324
- Tokenize the data in the dataset
325
-
326
  """
327
  def __init__(self, dataset_path, label_path, vocab, seq_len=30):
328
  self.dataset_path = dataset_path
329
  self.label_path = label_path
330
  self.vocab = vocab # Vocab object
331
- # self.encoder = OneHotEncoder(sparse=False)
332
-
333
  # Related to input dataset file
334
  self.lines = []
335
  self.labels = []
336
  self.feats = []
337
  if self.label_path:
338
  self.label_file = open(self.label_path, "r")
339
- for line in self.label_file:
340
  if line:
341
  line = line.strip()
342
  if not line:
343
  continue
344
- self.labels.append(int(line))
345
  self.label_file.close()
346
-
347
  # Comment this section if you are not using feat attribute
348
- try:
349
- j = 0
350
- dataset_info_file = open(self.label_path.replace("label", "info"), "r")
351
- for line in dataset_info_file:
 
352
  if line:
353
  line = line.strip()
354
  if not line:
355
  continue
356
-
357
- # # highGRschool_w_prior
358
- # feat_vec = [float(i) for i in line.split(",")[-3].split("\t")]
359
-
360
- # highGRschool_w_prior_w_diffskill_wo_fa
361
- feat_vec = [float(i) for i in line.split(",")[-3].split("\t")]
362
- feat2 = [float(i) for i in line.split(",")[-2].split("\t")]
363
- feat_vec.extend(feat2[1:])
364
-
365
- # # highGRschool_w_prior_w_diffskill_0fa_skill
366
- # feat_vec = [float(i) for i in line.split(",")[-3].split("\t")]
367
- # feat2 = [float(i) for i in line.split(",")[-2].split("\t")]
368
- # fa_feat_vec = [float(i) for i in line.split(",")[-1].split("\t")]
369
-
370
- # diff_skill = [f2 if f1==0 else 0 for f2, f1 in zip(feat2, fa_feat_vec)]
371
- # feat_vec.extend(diff_skill)
372
-
 
373
  if j == 0:
374
- print(len(feat_vec))
375
  j+=1
376
-
377
- # feat_vec.extend(feat2[1:])
378
- # feat_vec.extend(feat2)
379
- # feat_vec = [float(i) for i in line.split(",")[-2].split("\t")]
380
- # feat_vec = feat_vec[1:]
381
- # feat_vec = [float(line.split(",")[-1])]
382
- # feat_vec = [float(i) for i in line.split(",")[-1].split("\t")]
383
- # feat_vec = [ft-f1 for ft, f1 in zip(feat_vec, fa_feat_vec)]
384
-
385
  self.feats.append(feat_vec)
386
- dataset_info_file.close()
387
- except Exception as e:
388
- print(e)
389
- # labeler = np.array([0, 1]) #np.unique(self.labels)
390
- # print(f"Labeler {labeler}")
391
- # self.encoder.fit(labeler.reshape(-1,1))
392
- # self.labels = self.encoder.transform(np.array(self.labels).reshape(-1,1))
393
 
394
  self.file = open(self.dataset_path, "r")
395
- for line in self.file:
396
  if line:
397
  line = line.strip()
398
  if line:
399
- self.lines.append(line)
400
- self.file.close()
401
-
 
402
  self.len = len(self.lines)
403
- self.seq_len = seq_len
404
  print("Sequence length set at ", self.seq_len, len(self.lines), len(self.labels) if self.label_path else 0)
405
-
406
  def __len__(self):
407
  return self.len
408
-
409
  def __getitem__(self, item):
410
  org_line = self.lines[item].split("\t")
411
  dup_line = []
@@ -413,7 +300,7 @@ class TokenizerDatasetForCalibration(Dataset):
413
  for l in org_line:
414
  if l in ["OptionalTask_1", "EquationAnswer", "NumeratorFactor", "DenominatorFactor", "OptionalTask_2", "FirstRow1:1", "FirstRow1:2", "FirstRow2:1", "FirstRow2:2", "SecondRow", "ThirdRow"]:
415
  opt = True
416
- if opt and 'FinalAnswer-' in l:
417
  dup_line.append('[UNK]')
418
  else:
419
  dup_line.append(l)
@@ -425,35 +312,128 @@ class TokenizerDatasetForCalibration(Dataset):
425
  s1_feat = self.feats[item] if len(self.feats)>0 else 0
426
  padding = [self.vocab.vocab['[PAD]'] for _ in range(self.seq_len - len(s1))]
427
  s1.extend(padding), segment_label.extend(padding)
428
-
 
429
  output = {'input': s1,
430
  'label': s1_label,
431
  'feat': s1_feat,
432
  'segment_label': segment_label}
433
- return ({key: torch.tensor(value) for key, value in output.items()}, s1_label)
434
-
435
-
436
-
437
- # if __name__ == "__main__":
438
- # # import pickle
439
- # # k = pickle.load(open("dataset/CL4999_1920/unique_steps_list.pkl","rb"))
440
- # # print(k)
441
- # vocab_obj = Vocab("pretraining/vocab.txt")
442
- # vocab_obj.load_vocab()
443
- # datasetTrain = PretrainerDataset("pretraining/pretrain.txt", vocab_obj)
444
-
445
- # print(datasetTrain, len(datasetTrain))#, datasetTrain.documents_index)
446
- # print(datasetTrain[len(datasetTrain)-1])
447
- # for i, d in enumerate(datasetTrain):
448
- # print(d.items())
449
- # break
450
-
451
- # fine_tune = TokenizerDataset("finetuning/finetune.txt", "finetuning/finetune_label.txt", vocab_obj)
452
- # print(fine_tune)
453
- # print(fine_tune[len(fine_tune)-1])
454
- # print(fine_tune[random.randint(0, len(fine_tune))])
455
- # for i, d in enumerate(fine_tune):
456
- # print(d.items())
457
- # break
458
 
459
-
7
  from .vocab import Vocab
8
  import pickle
9
  import copy
10
+ import os
11
12
  class TokenizerDataset(Dataset):
13
  """
14
  Class name: TokenizerDataset
15
  Tokenize the data in the dataset
16
+ Feat length: 17
17
  """
18
  def __init__(self, dataset_path, label_path, vocab, seq_len=30):
19
  self.dataset_path = dataset_path
20
  self.label_path = label_path
21
  self.vocab = vocab # Vocab object
 
22
 
23
  # Related to input dataset file
24
  self.lines = []
 
52
  feat2 = [float(i) for i in line.split(",")[-2].split("\t")]
53
  feat_vec.extend(feat2[1:])
54
55
  if j == 0:
56
  print(len(feat_vec))
57
  j+=1
58
+
59
  self.feats.append(feat_vec)
60
  dataset_info_file.close()
61
  except Exception as e:
62
  print(e)
63
 
64
  self.file = open(self.dataset_path, "r")
65
  for line in self.file:
 
102
  'segment_label': segment_label}
103
  return {key: torch.tensor(value) for key, value in output.items()}
104
 
105
+ class TokenizerwSkillsDataset(Dataset):
 
106
  """
107
+ Feature length: 17
108
+
 
109
  """
110
  def __init__(self, dataset_path, label_path, vocab, seq_len=30):
111
+ print(f"dataset_path: {dataset_path}")
112
+ print(f"label_path: {label_path}")
113
+
114
  self.dataset_path = dataset_path
115
  self.label_path = label_path
116
  self.vocab = vocab # Vocab object
117
+ self.seq_len = seq_len
118
+
119
  # Related to input dataset file
120
  self.lines = []
121
  self.labels = []
122
  self.feats = []
123
+ selected_lines = []
124
+
125
+ print("TokenizerwSkillsDataset...............................")
126
+
127
  if self.label_path:
128
+ # Comment this section if you are not using feat attribute
129
+ dataset_info_file = open(self.label_path.replace("label", "info"), "r").readlines()
130
+ print(">>>>>>>>>>>>>>>>>", len(dataset_info_file))
131
+ j = 0
132
+ for idex, line in enumerate(dataset_info_file):
133
+ try:
134
+ if line:
135
+ line = line.strip()
136
+ if not line:
137
+ continue
138
+
139
+ feat_vec = [float(i) for i in line.split(",")[-9].split("\t")]
140
+ feat2 = [float(i) for i in line.split(",")[-8].split("\t")]
141
+ feat_vec.extend(feat2[1:])
142
+
143
+ if j == 0:
144
+ print(";;;;", len(feat_vec), feat_vec)
145
+ j+=1
146
+ self.feats.append(feat_vec)
147
+ selected_lines.append(idex)
148
+ except Exception as e:
149
+ print("................>")
150
+ print(e)
151
+ print("Error at index: ", idex)
152
+
153
  self.label_file = open(self.label_path, "r")
154
+ for idex, line in enumerate(self.label_file):
155
  if line:
156
  line = line.strip()
157
  if not line:
158
  continue
159
+ if idex in selected_lines:
160
+ self.labels.append(int(line))
161
+ # self.labels.append(int(line))
162
  self.label_file.close()
163
+
164
+ self.file = open(self.dataset_path, "r")
165
+ for idex, line in enumerate(self.file):
166
+ if line:
167
+ line = line.strip()
168
+ if line:
169
+ if idex in selected_lines:
170
+ self.lines.append(line)
171
+ # self.lines.append(line)
172
+ self.file.close()
173
+ self.len = len(self.lines)
174
+ print("Sequence length set at ", self.seq_len, len(self.lines), len(self.labels) if self.label_path else 0)
175
+
176
+ def __len__(self):
177
+ return self.len
178
+
179
+ def __getitem__(self, item):
180
+ org_line = self.lines[item].split("\t")
181
+ dup_line = []
182
+ opt = False
183
+ for l in org_line:
184
+ if l in ["OptionalTask_1", "EquationAnswer", "NumeratorFactor", "DenominatorFactor", "OptionalTask_2", "FirstRow1:1", "FirstRow1:2", "FirstRow2:1", "FirstRow2:2", "SecondRow", "ThirdRow"]:
185
+ opt = True
186
+ if opt and 'FinalAnswer-' in l:
187
+ dup_line.append('[UNK]')
188
+ else:
189
+ dup_line.append(l)
190
+ dup_line = "\t".join(dup_line)
191
+ # print(dup_line)
192
+ s1 = self.vocab.to_seq(dup_line, self.seq_len) # This is like tokenizer and adds [CLS] and [SEP].
193
+ s1_label = self.labels[item] if self.label_path else 0
194
+ segment_label = [1 for _ in range(len(s1))]
195
+ s1_feat = self.feats[item] if len(self.feats)>0 else 0
196
+ padding = [self.vocab.vocab['[PAD]'] for _ in range(self.seq_len - len(s1))]
197
+ s1.extend(padding), segment_label.extend(padding)
198
+ # print(s1_feat)
199
+
200
+ output = {'input': s1,
201
+ 'label': s1_label,
202
+ 'feat': s1_feat,
203
+ 'segment_label': segment_label}
204
+ return {key: torch.tensor(value) for key, value in output.items()}
205
+
206
+
207
+ class TokenizerwTimeDataset(Dataset):
208
+ """
209
+ Feature length: 4
210
+
211
+ """
212
+ def __init__(self, dataset_path, label_path, vocab, seq_len=30):
213
+ print(f"dataset_path: {dataset_path}")
214
+ print(f"label_path: {label_path}")
215
+
216
+ self.dataset_path = dataset_path
217
+ self.label_path = label_path
218
+ self.vocab = vocab # Vocab object
219
+ self.seq_len = seq_len
220
+
221
+ # Related to input dataset file
222
+ self.lines = []
223
+ self.labels = []
224
+ self.feats = []
225
+ selected_lines = []
226
+
227
+ print("TokenizerwTimeDataset...............................")
228
+ time_df = pickle.load(open("ratio_proportion_change3_2223/sch_largest_100-coded/time_info/full_data_normalized_time.pkl", "rb"))
229
+ print("time: ?? ", time_df.shape)
230
+
231
+ if self.label_path:
232
  # Comment this section if you are not using feat attribute
233
+ dataset_info_file = open(self.label_path.replace("label", "info"), "r").readlines()
234
+ print(">>>>>>>>>>>>>>>>>", len(dataset_info_file))
235
+ j = 0
236
+ for idex, line in enumerate(dataset_info_file):
237
+ try:
238
  if line:
239
  line = line.strip()
240
  if not line:
241
  continue
242
+
243
+ feat_vec = []
244
+
245
+ sch = line.split(",")[0]
246
+ stu = line.split(",")[2]
247
+ progress = line.split(",")[3]
248
+ prob_id = line.split(",")[4]
249
+
250
+ total_time = time_df.loc[(sch, stu, progress, prob_id)]['total_time'].item()
251
+ faopt_time = time_df.loc[(sch, stu, progress, prob_id)]['faopt_time'].item()
252
+ opt_time = time_df.loc[(sch, stu, progress, prob_id)]['opt_time'].item()
253
+ nonopt_time = time_df.loc[(sch, stu, progress, prob_id)]['nonopt_time'].item()
254
+
255
+ feat_vec.append(faopt_time)
256
+ feat_vec.append(total_time)
257
+ feat_vec.append(opt_time)
258
+ feat_vec.append(nonopt_time)
259
+
260
  if j == 0:
261
+ print(";;;;", len(feat_vec), feat_vec)
262
  j+=1
263
  self.feats.append(feat_vec)
264
+ selected_lines.append(idex)
265
+ except Exception as e:
266
+ print("................>")
267
+ print(e)
268
+ print("Error at index: ", idex)
269
+
270
+ self.label_file = open(self.label_path, "r")
271
+ for idex, line in enumerate(self.label_file):
272
+ if line:
273
+ line = line.strip()
274
+ if not line:
275
+ continue
276
+ if idex in selected_lines:
277
+ self.labels.append(int(line))
278
+ # self.labels.append(int(line))
279
+ self.label_file.close()
280
 
281
  self.file = open(self.dataset_path, "r")
282
+ for idex, line in enumerate(self.file):
283
  if line:
284
  line = line.strip()
285
  if line:
286
+ if idex in selected_lines:
287
+ self.lines.append(line)
288
+ # self.lines.append(line)
289
+ self.file.close()
290
  self.len = len(self.lines)
 
291
  print("Sequence length set at ", self.seq_len, len(self.lines), len(self.labels) if self.label_path else 0)
292
+
293
  def __len__(self):
294
  return self.len
295
+
296
  def __getitem__(self, item):
297
  org_line = self.lines[item].split("\t")
298
  dup_line = []
 
300
  for l in org_line:
301
  if l in ["OptionalTask_1", "EquationAnswer", "NumeratorFactor", "DenominatorFactor", "OptionalTask_2", "FirstRow1:1", "FirstRow1:2", "FirstRow2:1", "FirstRow2:2", "SecondRow", "ThirdRow"]:
302
  opt = True
303
+ if opt and 'FinalAnswer-' in l:
304
  dup_line.append('[UNK]')
305
  else:
306
  dup_line.append(l)
 
312
  s1_feat = self.feats[item] if len(self.feats)>0 else 0
313
  padding = [self.vocab.vocab['[PAD]'] for _ in range(self.seq_len - len(s1))]
314
  s1.extend(padding), segment_label.extend(padding)
315
+ # print(s1_feat)
316
+
317
  output = {'input': s1,
318
  'label': s1_label,
319
  'feat': s1_feat,
320
  'segment_label': segment_label}
321
+ return {key: torch.tensor(value) for key, value in output.items()}
322
 
323
+ class TokenizerwSkillsTimeDataset(Dataset):
324
+ """
325
+ Feature length: 17+4 = 21
326
+
327
+ """
328
+ def __init__(self, dataset_path, label_path, vocab, seq_len=30):
329
+ print(f"dataset_path: {dataset_path}")
330
+ print(f"label_path: {label_path}")
331
+
332
+ self.dataset_path = dataset_path
333
+ self.label_path = label_path
334
+ self.vocab = vocab # Vocab object
335
+ self.seq_len = seq_len
336
+
337
+ # Related to input dataset file
338
+ self.lines = []
339
+ self.labels = []
340
+ self.feats = []
341
+ selected_lines = []
342
+
343
+ print("TokenizerwSkillsTimeDataset...............................")
344
+ time_df = pickle.load(open("ratio_proportion_change3_2223/sch_largest_100-coded/time_info/full_data_normalized_time.pkl", "rb"))
345
+ print("time: ", time_df.shape)
346
+
347
+ if self.label_path:
348
+ # Comment this section if you are not using feat attribute
349
+ dataset_info_file = open(self.label_path.replace("label", "info"), "r").readlines()
350
+ print(">>>>>>>>>>>>>>>>>", len(dataset_info_file))
351
+ j = 0
352
+ for idex, line in enumerate(dataset_info_file):
353
+ try:
354
+ if line:
355
+ line = line.strip()
356
+ if not line:
357
+ continue
358
+
359
+ feat_vec = [float(i) for i in line.split(",")[-9].split("\t")]
360
+ feat2 = [float(i) for i in line.split(",")[-8].split("\t")]
361
+ feat_vec.extend(feat2[1:])
362
+
363
+ sch = line.split(",")[0]
364
+ stu = line.split(",")[2]
365
+ progress = line.split(",")[3]
366
+ prob_id = line.split(",")[4]
367
+
368
+ total_time = time_df.loc[(sch, stu, progress, prob_id)]['total_time'].item()
369
+ faopt_time = time_df.loc[(sch, stu, progress, prob_id)]['faopt_time'].item()
370
+ opt_time = time_df.loc[(sch, stu, progress, prob_id)]['opt_time'].item()
371
+ nonopt_time = time_df.loc[(sch, stu, progress, prob_id)]['nonopt_time'].item()
372
+
373
+ feat_vec.append(faopt_time)
374
+ feat_vec.append(total_time)
375
+ feat_vec.append(opt_time)
376
+ feat_vec.append(nonopt_time)
377
+
378
+ if j == 0:
379
+ print(";;;;", len(feat_vec), feat_vec)
380
+ j+=1
381
+ self.feats.append(feat_vec)
382
+ selected_lines.append(idex)
383
+ except Exception as e:
384
+ print("................>")
385
+ print(e)
386
+ print("Error at index: ", idex)
387
+
388
+ self.label_file = open(self.label_path, "r")
389
+ for idex, line in enumerate(self.label_file):
390
+ if line:
391
+ line = line.strip()
392
+ if not line:
393
+ continue
394
+ if idex in selected_lines:
395
+ self.labels.append(int(line))
396
+ # self.labels.append(int(line))
397
+ self.label_file.close()
398
+
399
+ self.file = open(self.dataset_path, "r")
400
+ for idex, line in enumerate(self.file):
401
+ if line:
402
+ line = line.strip()
403
+ if line:
404
+ if idex in selected_lines:
405
+ self.lines.append(line)
406
+ # self.lines.append(line)
407
+ self.file.close()
408
+ self.len = len(self.lines)
409
+ print("Sequence length set at ", self.seq_len, len(self.lines), len(self.labels) if self.label_path else 0)
410
+
411
+ def __len__(self):
412
+ return self.len
413
+
414
+ def __getitem__(self, item):
415
+ org_line = self.lines[item].split("\t")
416
+ dup_line = []
417
+ opt = False
418
+ for l in org_line:
419
+ if l in ["OptionalTask_1", "EquationAnswer", "NumeratorFactor", "DenominatorFactor", "OptionalTask_2", "FirstRow1:1", "FirstRow1:2", "FirstRow2:1", "FirstRow2:2", "SecondRow", "ThirdRow"]:
420
+ opt = True
421
+ if opt and 'FinalAnswer-' in l:
422
+ dup_line.append('[UNK]')
423
+ else:
424
+ dup_line.append(l)
425
+ dup_line = "\t".join(dup_line)
426
+ # print(dup_line)
427
+ s1 = self.vocab.to_seq(dup_line, self.seq_len) # This is like tokenizer and adds [CLS] and [SEP].
428
+ s1_label = self.labels[item] if self.label_path else 0
429
+ segment_label = [1 for _ in range(len(s1))]
430
+ s1_feat = self.feats[item] if len(self.feats)>0 else 0
431
+ padding = [self.vocab.vocab['[PAD]'] for _ in range(self.seq_len - len(s1))]
432
+ s1.extend(padding), segment_label.extend(padding)
433
+ # print(s1_feat)
434
+
435
+ output = {'input': s1,
436
+ 'label': s1_label,
437
+ 'feat': s1_feat,
438
+ 'segment_label': segment_label}
439
+ return {key: torch.tensor(value) for key, value in output.items()}
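
Usage note (not part of the commit): the three classes added here, TokenizerwSkillsDataset, TokenizerwTimeDataset and TokenizerwSkillsTimeDataset, keep the same constructor signature as the existing TokenizerDataset, so they can be wrapped in a standard PyTorch DataLoader. The sketch below is a minimal illustration only; the import paths and the vocab/finetune file names are placeholder assumptions borrowed from the commented-out __main__ demo that this commit removes, and the actual file layout may differ.

# Illustrative sketch, not part of the commit; module layout and paths are assumptions.
from torch.utils.data import DataLoader
from src.vocab import Vocab                      # assumed package layout
from src.dataset import TokenizerwSkillsDataset  # assumed package layout

vocab_obj = Vocab("pretraining/vocab.txt")       # placeholder vocab file
vocab_obj.load_vocab()

# Skill features only (feature length 17 per the class docstring).
# The info file is derived from the label path by replacing "label" with "info".
dataset = TokenizerwSkillsDataset(
    "finetuning/finetune.txt",                   # placeholder sequence file
    "finetuning/finetune_label.txt",             # placeholder label file
    vocab_obj,
    seq_len=30,
)

loader = DataLoader(dataset, batch_size=32, shuffle=True)
for batch in loader:
    # Each item is a dict of tensors: 'input', 'label', 'feat', 'segment_label'.
    print(batch["input"].shape, batch["feat"].shape)
    break

The time-based variants (TokenizerwTimeDataset, TokenizerwSkillsTimeDataset) additionally load the pickled time DataFrame from the hard-coded path ratio_proportion_change3_2223/sch_largest_100-coded/time_info/full_data_normalized_time.pkl and look rows up by (school, student, progress, problem id), so that file must exist relative to the working directory before they can be constructed.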