Pratham Bhat committed
Commit a6c4516 · 1 Parent(s): e165882

Add trace logs, and load the model only once at startup

Files changed (1)
  1. main.py +41 -28
main.py CHANGED
@@ -18,9 +18,16 @@ import torch
 # torch.mps.empty_cache()
 # torch.set_num_threads(1)

-# import os
+import os
 # os.environ["HF_HOME"] = "/.cache"
 # os.environ["TRANSFORMERS_CACHE"] = "/.cache"
+os.environ["TQDM_DISABLE"] = "0"
+os.environ["TQDM_FORCE"] = "1"
+
+from transformers.utils import logging
+logging.set_verbosity_info()
+logger = logging.get_logger()  # optional: get a logger instance if you want to customize
+logger.info("Hugging Face Transformers download started.")


 app = FastAPI()
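
This hunk is the tracing half of the commit: the tqdm environment variables are set so progress bars stay visible in the container logs (the values are taken verbatim from the diff), and the Transformers logger is raised to INFO so the long weight download is traceable. A standalone sketch of the same setup, assuming a recent transformers release where these helpers live in transformers.utils.logging; the enable_progress_bar() call is my addition and is not part of this commit:

# Sketch: the tracing configuration in isolation.
import os

os.environ["TQDM_DISABLE"] = "0"   # keep tqdm progress bars enabled (value mirrors the commit)
os.environ["TQDM_FORCE"] = "1"     # mirrors the commit; passed through to tqdm as-is

from transformers.utils import logging

logging.set_verbosity_info()          # INFO-level messages from transformers (loads, cache hits)
logging.enable_progress_bar()         # assumption, not in the commit: explicit opt-in to download bars
logger = logging.get_logger(__name__)
logger.info("Hugging Face Transformers download started.")

With verbosity at INFO, the from_pretrained calls in the next hunk become much more talkative about which files they load and from where.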
@@ -42,40 +49,45 @@ def format_prompt(system, message, history):
     prompt += {"role": "user", "content": message}
     return prompt

-# def setup():
-#     device = "cuda" if torch.cuda.is_available() else "cpu"
+def setup():
+    device = "cuda" if torch.cuda.is_available() else "cpu"

-#     # if torch.backends.mps.is_available():
-#     #     device = torch.device("mps")
-#     #     x = torch.ones(1, device=device)
-#     #     print (x)
-#     # else:
-#     #     device="cpu"
-#     #     print ("MPS device not found.")
+    # if torch.backends.mps.is_available():
+    #     device = torch.device("mps")
+    #     x = torch.ones(1, device=device)
+    #     print (x)
+    # else:
+    #     device="cpu"
+    #     print ("MPS device not found.")

-#     # device = "auto"
-#     # device=torch.device("cpu")
+    # device = "auto"
+    # device=torch.device("cpu")

-#     model_path = "ibm-granite/granite-34b-code-instruct-8k"
-#     tokenizer = AutoTokenizer.from_pretrained(model_path)
-#     # drop device_map if running on CPU
-#     model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device)
-#     model.eval()
-
-#     return model, tokenizer, device
-
-def generate(item: Item):
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-
     model_path = "ibm-granite/granite-34b-code-instruct-8k"
-
     print("Loading tokenizer for model: " + model_path, file=sys.stderr)
-    tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir="/code/huggingface/transformers")
-    # drop device_map if running on CPU
+    tokenizer = AutoTokenizer.from_pretrained(model_path)

     print("Loading Model for causal LM for model: " + model_path, file=sys.stderr)
+    # drop device_map if running on CPU
     model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device)
     model.eval()
+
+    return model, tokenizer, device
+
+def generate(item: Item, model, tokenizer, device):
+    # device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    # model_path = "ibm-granite/granite-34b-code-instruct-8k"
+
+    # print("Loading tokenizer for model: " + model_path, file=sys.stderr)
+    # tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir="/code/huggingface/transformers")
+    # # drop device_map if running on CPU
+
+    # print("Loading Model for causal LM for model: " + model_path, file=sys.stderr)
+    # model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device)
+    # model.eval()
+
+    print("Adapting the input into a template...", file=sys.stderr)
     # change input text as desired
     chat = format_prompt(item.system_prompt, item.prompt, item.history)
     chat = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
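
This is the core of the "load only once" change: the previously commented-out setup() becomes the single place where AutoTokenizer and AutoModelForCausalLM are instantiated (the cache_dir override is dropped along the way), while generate() now receives model, tokenizer and device instead of rebuilding them on every request. A hypothetical driver to exercise the new signature; the Item field values and the empty history are guesses based on how format_prompt(item.system_prompt, item.prompt, item.history) is used, and the real Item model may require additional fields:

# Hypothetical smoke test, run next to main.py. Importing main executes the
# module-level `model, tokenizer, device = setup()` shown in the final hunk below,
# so the heavy load happens exactly once, at import time.
from main import Item, generate, model, tokenizer, device

item = Item(
    system_prompt="You are a helpful coding assistant.",  # assumed example value
    prompt="Write a function that reverses a string.",    # assumed example value
    history=[],                                           # assumed empty chat history
)

# Each call reuses the already-loaded weights; there is no per-request from_pretrained.
print(generate(item, model, tokenizer, device))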
@@ -97,13 +109,14 @@ def generate(item: Item):
     output_text = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
     return output_text

+model, tokenizer, device = setup()

 # model, tokenizer, device = setup()

 @app.post("/generate/")
 async def generate_text(item: Item):
-    return {"response": generate(item)}
-    # return {"response": generate(item, model, tokenizer, device)}
+    # return {"response": generate(item)}
+    return {"response": generate(item, model, tokenizer, device)}

 @app.get("/")
 async def generate_text_root(item: Item):
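
The final hunk wires everything together: setup() now runs once when main.py is imported, and the endpoint passes the cached objects into generate(). A side effect is that merely importing the module (for tests, tooling, or an auto-reloader) triggers the full 34B load. A common alternative, not part of this commit and assuming FastAPI 0.93+ for the lifespan parameter, is to defer loading to the application's lifespan hook:

# Sketch of how main.py could be restructured (setup, generate and Item are the
# definitions already in main.py); loading is tied to server startup, not import.
from contextlib import asynccontextmanager
from fastapi import FastAPI

state = {}

@asynccontextmanager
async def lifespan(app: FastAPI):
    # Runs once when uvicorn starts the application, before any request is served.
    state["model"], state["tokenizer"], state["device"] = setup()
    yield
    state.clear()  # drop references on shutdown

app = FastAPI(lifespan=lifespan)

@app.post("/generate/")
async def generate_text(item: Item):
    return {"response": generate(item, state["model"], state["tokenizer"], state["device"])}

Either way the expensive from_pretrained calls happen exactly once per process; the lifespan variant just moves them from import time to server start.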
 