from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, TextStreamer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, TaskType
from trl import SFTTrainer
import torch
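
# Requires the datasets, transformers, peft, trl, and torch packages, plus
# bitsandbytes for the paged_adamw_8bit optimizer used below.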


# Clear CUDA cache before loading models
torch.cuda.empty_cache()

device = torch.device("cuda")
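
# Assumption: a CUDA-capable GPU is available throughout (device_map="cuda",
# bf16 weights, and the 8-bit paged optimizer all depend on it).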

# Load the Stack Overflow Python Q&A dataset (train split only)
dataset = load_dataset("koutch/stackoverflow_python", split="train")

def format_example(example):
    # Use the same <|user|>/<|assistant|> markers registered as special
    # tokens below, so the training text matches the inference prompt.
    return {
        "text": f"<|user|>\n{example['question_body'].strip()}\n<|assistant|>\n{example['answer_body'].strip()}"
    }

dataset = dataset.map(format_example)
dataset = dataset.filter(lambda x: x["text"] is not None and len(x["text"]) > 0)
dataset = dataset.shuffle(seed=42).select(range(30_000))
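
# Quick sanity check on the formatted text (illustrative, safe to remove)
print(dataset[0]["text"][:300])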

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")

# Register the chat-style role markers used in the training text above and
# the inference prompt below as special tokens.
tokenizer.add_special_tokens({'additional_special_tokens': ["<|user|>", "<|assistant|>"]})
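
# Assumption: guard against the tokenizer shipping without a pad token,
# which both tokenization padding and the collator below require.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token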

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Causal LM, so no masked LM
)

# Load the base model in bfloat16 to match bf16=True in the training
# arguments below (mixing fp16 weights with bf16 autocast is a common
# source of NaN losses).
model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
    device_map="cuda",
    torch_dtype=torch.bfloat16,
)

# The vocabulary grew by two tokens above, so resize the embedding matrix
# to match before training.
model.resize_token_embeddings(len(tokenizer))

lora_config = LoraConfig(
    r=8,                                  # low-rank update dimension
    lora_alpha=32,                        # scaling factor for the updates
    target_modules=["q_proj", "v_proj"],  # attention query/value projections
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)
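
# Sanity check: report how few parameters LoRA actually trains
# (print_trainable_parameters is part of the peft PeftModel API).
model.print_trainable_parameters()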

def tokenize_function(examples):
    # Tokenize without fixed-length padding; the collator pads each batch
    # dynamically, which wastes far fewer tokens than padding="max_length".
    result = tokenizer(
        examples["text"],
        truncation=True,
        max_length=1024,
    )
    result["text"] = examples["text"]  # keep the raw text column as well
    return result

tokenized_dataset = dataset.map(tokenize_function, batched=True)
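
# map() keeps the original columns alongside input_ids/attention_mask; the
# Trainer's default remove_unused_columns drops whatever the model's
# forward() does not accept.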

# Training arguments
training_args = TrainingArguments(
    output_dir="/home/deshpa70",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    num_train_epochs=1,
    logging_dir="./logs",
    save_total_limit=2,
    logging_steps=250,
    save_steps=500,
    learning_rate=3e-4,
    bf16=True,
    optim="paged_adamw_8bit",
    report_to="none"
)
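
# Effective batch size is 2 x 2 = 4 sequences per device
# (per_device_train_batch_size x gradient_accumulation_steps).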

# The model is already wrapped with get_peft_model above, so peft_config
# does not need to be passed to the trainer again.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

trainer.train()
model.eval()
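
# A minimal sketch for persisting the result (the path is illustrative):
# save_pretrained on a PEFT-wrapped model writes only the small LoRA
# adapter weights, not the full base model.
model.save_pretrained("/home/deshpa70/lora_adapter")
tokenizer.save_pretrained("/home/deshpa70/lora_adapter")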

# Simple interactive loop for trying out the fine-tuned model
while True:
    prompt = input("\nEnter your programming question (or type 'exit' to quit):\n> ")
    if prompt.strip().lower() == 'exit':
        break

    # Same <|user|>/<|assistant|> template the model was fine-tuned on
    formatted_prompt = f"<|user|>\n{prompt}\n<|assistant|>\n"
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # inference_mode avoids building autograd state during generation
    with torch.inference_mode():
        output = model.generate(
            **inputs,
            max_new_tokens=2048,  # practical cap for interactive Q&A
            do_sample=True,
            top_p=0.9,
            temperature=0.8,
            repetition_penalty=1.1,
            streamer=streamer
        )
    # The streamer already printed the completion token-by-token, so the
    # returned tensor needs no separate decoding here.


# Free cached GPU memory once the interactive session ends
torch.cuda.empty_cache()