Pratham Bhat committed
Commit a6c4516 · 1 Parent(s): e165882

Add trace logs, and load the model only once at startup

Files changed (1)
  1. main.py +41 -28
main.py CHANGED
@@ -18,9 +18,16 @@ import torch
 # torch.mps.empty_cache()
 # torch.set_num_threads(1)

-# import os
+import os
 # os.environ["HF_HOME"] = "/.cache"
 # os.environ["TRANSFORMERS_CACHE"] = "/.cache"
+os.environ["TQDM_DISABLE"] = "0"
+os.environ["TQDM_FORCE"] = "1"
+
+from transformers.utils import logging
+logging.set_verbosity_info()
+logger = logging.get_logger()  # optional: get a logger instance if you want to customize
+logger.info("Hugging Face Transformers download started.")


 app = FastAPI()
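
This hunk is the tracing half of the commit: the tqdm environment variables are set so progress bars stay visible in the container logs (the values are taken verbatim from the diff), and the Transformers logger is raised to INFO so the long weight download is traceable. A standalone sketch of the same setup, assuming a recent transformers release where these helpers live in transformers.utils.logging; the enable_progress_bar() call is my addition and is not part of this commit:

# Sketch: the tracing configuration in isolation.
import os

os.environ["TQDM_DISABLE"] = "0"   # keep tqdm progress bars enabled (value mirrors the commit)
os.environ["TQDM_FORCE"] = "1"     # mirrors the commit; passed through to tqdm as-is

from transformers.utils import logging

logging.set_verbosity_info()          # INFO-level messages from transformers (loads, cache hits)
logging.enable_progress_bar()         # assumption, not in the commit: explicit opt-in to download bars
logger = logging.get_logger(__name__)
logger.info("Hugging Face Transformers download started.")

With verbosity at INFO, the from_pretrained calls in the next hunk become much more talkative about which files they load and from where.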
@@ -42,40 +49,45 @@ def format_prompt(system, message, history):
     prompt += {"role": "user", "content": message}
     return prompt

-# def setup():
-#     device = "cuda" if torch.cuda.is_available() else "cpu"
+def setup():
+    device = "cuda" if torch.cuda.is_available() else "cpu"

-#     # if torch.backends.mps.is_available():
-#     #     device = torch.device("mps")
-#     #     x = torch.ones(1, device=device)
-#     #     print (x)
-#     # else:
-#     #     device="cpu"
-#     #     print ("MPS device not found.")
+    # if torch.backends.mps.is_available():
+    #     device = torch.device("mps")
+    #     x = torch.ones(1, device=device)
+    #     print (x)
+    # else:
+    #     device="cpu"
+    #     print ("MPS device not found.")

-#     # device = "auto"
-#     # device=torch.device("cpu")
+    # device = "auto"
+    # device=torch.device("cpu")

-#     model_path = "ibm-granite/granite-34b-code-instruct-8k"
-#     tokenizer = AutoTokenizer.from_pretrained(model_path)
-#     # drop device_map if running on CPU
-#     model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device)
-#     model.eval()
-
-#     return model, tokenizer, device
-
-def generate(item: Item):
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-
     model_path = "ibm-granite/granite-34b-code-instruct-8k"
-
     print("Loading tokenizer for model: " + model_path, file=sys.stderr)
-    tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir="/code/huggingface/transformers")
-    # drop device_map if running on CPU
+    tokenizer = AutoTokenizer.from_pretrained(model_path)

     print("Loading Model for causal LM for model: " + model_path, file=sys.stderr)
+    # drop device_map if running on CPU
     model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device)
     model.eval()
+
+    return model, tokenizer, device
+
+def generate(item: Item, model, tokenizer, device):
+    # device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    # model_path = "ibm-granite/granite-34b-code-instruct-8k"
+
+    # print("Loading tokenizer for model: " + model_path, file=sys.stderr)
+    # tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir="/code/huggingface/transformers")
+    # # drop device_map if running on CPU
+
+    # print("Loading Model for causal LM for model: " + model_path, file=sys.stderr)
+    # model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device)
+    # model.eval()
+
+    print("Adapting the input into a template...", file=sys.stderr)
     # change input text as desired
     chat = format_prompt(item.system_prompt, item.prompt, item.history)
     chat = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
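
This is the core of the "load only once" change: the previously commented-out setup() becomes the single place where AutoTokenizer and AutoModelForCausalLM are instantiated (the cache_dir override is dropped along the way), while generate() now receives model, tokenizer and device instead of rebuilding them on every request. A hypothetical driver to exercise the new signature; the Item field values and the empty history are guesses based on how format_prompt(item.system_prompt, item.prompt, item.history) is used, and the real Item model may require additional fields:

# Hypothetical smoke test, run next to main.py. Importing main executes the
# module-level `model, tokenizer, device = setup()` shown in the final hunk below,
# so the heavy load happens exactly once, at import time.
from main import Item, generate, model, tokenizer, device

item = Item(
    system_prompt="You are a helpful coding assistant.",  # assumed example value
    prompt="Write a function that reverses a string.",    # assumed example value
    history=[],                                           # assumed empty chat history
)

# Each call reuses the already-loaded weights; there is no per-request from_pretrained.
print(generate(item, model, tokenizer, device))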
@@ -97,13 +109,14 @@ def generate(item: Item):
     output_text = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
     return output_text

+model, tokenizer, device = setup()

 # model, tokenizer, device = setup()

 @app.post("/generate/")
 async def generate_text(item: Item):
-    return {"response": generate(item)}
-    # return {"response": generate(item, model, tokenizer, device)}
+    # return {"response": generate(item)}
+    return {"response": generate(item, model, tokenizer, device)}

 @app.get("/")
 async def generate_text_root(item: Item):
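
The final hunk wires everything together: setup() now runs once when main.py is imported, and the endpoint passes the cached objects into generate(). A side effect is that merely importing the module (for tests, tooling, or an auto-reloader) triggers the full 34B load. A common alternative, not part of this commit and assuming FastAPI 0.93+ for the lifespan parameter, is to defer loading to the application's lifespan hook:

# Sketch of how main.py could be restructured (setup, generate and Item are the
# definitions already in main.py); loading is tied to server startup, not import.
from contextlib import asynccontextmanager
from fastapi import FastAPI

state = {}

@asynccontextmanager
async def lifespan(app: FastAPI):
    # Runs once when uvicorn starts the application, before any request is served.
    state["model"], state["tokenizer"], state["device"] = setup()
    yield
    state.clear()  # drop references on shutdown

app = FastAPI(lifespan=lifespan)

@app.post("/generate/")
async def generate_text(item: Item):
    return {"response": generate(item, state["model"], state["tokenizer"], state["device"])}

Either way the expensive from_pretrained calls happen exactly once per process; the lifespan variant just moves them from import time to server start.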
 