{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "id": "fD24jJxq7t3k" }, "outputs": [], "source": [ "# @title # ⚡ AutoQuant\n", "\n", "# @markdown > 🗣️ [Large Language Model Course](https://github.com/mlabonne/llm-course)\n", "\n", "# @markdown ❤️ Created by [@maximelabonne](https://twitter.com/maximelabonne).\n", "\n", "# @markdown **Usage:** Download the model by **running this cell** and then run the cells corresponding to your quantization methods of interest.\n", "\n", "# @markdown To quantize a 7B model, GGUF only needs a T4 GPU, while the other methods require an A100 GPU.\n", "\n", "# @markdown *See also the [AutoQuantize](https://colab.research.google.com/drive/1Li3USnl3yoYctqJLtYux3LAIy4Bnnv3J) notebook from zainulabideen.*\n", "\n", "# @markdown ---\n", "\n", "# @markdown ## 🤗 Download model (required)\n", "# @markdown `HF_TOKEN` corresponds to the name of the secret that stores your [Hugging Face access token](https://huggingface.co/settings/tokens) in Colab.\n", "\n", "MODEL_ID = \"mlabonne/Zebrafish-7B\" # @param {type:\"string\"}\n", "USERNAME = \"Artples\" # @param {type:\"string\"}\n", "HF_TOKEN = \"HF_TOKEN\" # @param {type:\"string\"}\n", "\n", "MODEL_NAME = MODEL_ID.split('/')[-1]\n", "\n", "# Download model\n", "!git lfs install\n", "!git clone https://huggingface.co/{MODEL_ID}\n", "!pip install -q huggingface_hub\n", "\n", "from huggingface_hub import create_repo, HfApi, ModelCard\n", "from google.colab import userdata, runtime\n", "\n", "# Defined in the secrets tab in Google Colab\n", "hf_token = userdata.get(HF_TOKEN)\n", "api = HfApi()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "NL0yGhbe3EFk" }, "outputs": [], "source": [ "# @title ## 🧩 GGUF\n", "\n", "# @markdown Quantization methods: `q2_k`, `q3_k_l`, `q3_k_m`, `q3_k_s`, `q4_0`, `q4_1`, `q4_k_m`, `q4_k_s`, `q5_0`, `q5_1`, `q5_k_m`, `q5_k_s`, `q6_k`, `q8_0`\n", "\n", "# @markdown Learn more about GGUF and quantization methods in [this article](https://mlabonne.github.io/blog/posts/Quantize_Llama_2_models_using_ggml.html).\n", "\n", "QUANTIZATION_FORMAT = \"q5_k_m\" # @param {type:\"string\"}\n", "QUANTIZATION_METHODS = QUANTIZATION_FORMAT.replace(\" \", \"\").split(\",\")\n", "\n", "# Install llama.cpp\n", "!git clone https://github.com/ggerganov/llama.cpp\n", "!cd llama.cpp && git pull && make clean && LLAMA_CUBLAS=1 make\n", "!pip install -r llama.cpp/requirements.txt\n", "\n", "# Convert to fp16\n", "fp16 = f\"{MODEL_NAME}/{MODEL_NAME.lower()}.fp16.bin\"\n", "!python llama.cpp/convert.py {MODEL_NAME} --outtype f16 --outfile {fp16}\n", "\n", "# Quantize the model for each method in the QUANTIZATION_METHODS list\n", "for method in QUANTIZATION_METHODS:\n", " qtype = f\"{MODEL_NAME}/{MODEL_NAME.lower()}.{method.upper()}.gguf\"\n", " !./llama.cpp/quantize {fp16} {qtype} {method}\n", "\n", "# Create model card\n", "card = ModelCard.load(MODEL_ID)\n", "card.data.tags.append(\"autoquant\")\n", "card.data.tags.append(\"gguf\")\n", "card.save(f'{MODEL_NAME}/README.md')\n", "\n", "# Upload model\n", "create_repo(\n", " repo_id = f\"{USERNAME}/{MODEL_NAME}-GGUF\",\n", " repo_type=\"model\",\n", " exist_ok=True,\n", " token=hf_token\n", ")\n", "api.upload_folder(\n", " folder_path=MODEL_NAME,\n", " repo_id=f\"{USERNAME}/{MODEL_NAME}-GGUF\",\n", " allow_patterns=[\"*.gguf\",\"$.md\"],\n", " token=hf_token\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "id": "OE_R3AXG5Y-F" }, "outputs": [], "source": 
[ "# @title ## 🧠 GPTQ\n", "\n", "# @markdown Learn more about the GPTQ algorithm in [this article](https://mlabonne.github.io/blog/posts/4_bit_Quantization_with_GPTQ.html).\n", "\n", "# !pip install auto-gptq optimum accelerate\n", "\n", "# from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig\n", "\n", "# BITS = 4 # @param {type:\"integer\"}\n", "# GROUP_SIZE = 128 # @param {type:\"integer\"}\n", "# DAMP_PERCENT = 0.1 # @param {type:\"number\"}\n", "\n", "# # Quantize model\n", "# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)\n", "# quantization_config = GPTQConfig(bits=BITS, dataset=\"c4\", tokenizer=tokenizer, damp_percent=DAMP_PERCENT)\n", "# model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map=\"auto\", quantization_config=quantization_config, low_cpu_mem_usage=True)\n", "\n", "# Save model and tokenizer\n", "save_folder = MODEL_ID + \"-GPTQ\"\n", "model.save_pretrained(save_folder, use_safetensors=True)\n", "tokenizer.save_pretrained(save_folder)\n", "\n", "# Create model card\n", "card = ModelCard.load(MODEL_ID)\n", "card.data.tags.append(\"autoquant\")\n", "card.data.tags.append(\"gptq\")\n", "card.save(f'{save_folder}/README.md')\n", "\n", "# Upload model\n", "create_repo(\n", " repo_id = f\"{USERNAME}/{MODEL_NAME}-GPTQ\",\n", " repo_type=\"model\",\n", " exist_ok=True,\n", " token=hf_token\n", ")\n", "api.upload_folder(\n", " folder_path=save_folder,\n", " repo_id=f\"{USERNAME}/{MODEL_NAME}-GPTQ\",\n", " token=hf_token\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "id": "ZC9Nsr9u5WhN" }, "outputs": [], "source": [ "# @title # 🦙 ExLlamaV2\n", "\n", "# @markdown Learn more about ExLlamaV2 in [this article](https://mlabonne.github.io/blog/posts/ExLlamaV2_The_Fastest_Library_to_Run%C2%A0LLMs.html).\n", "\n", "BPW = 5.0 # @param {type:\"number\"}\n", "\n", "# Install ExLLamaV2\n", "!git clone https://github.com/turboderp/exllamav2\n", "!pip install -e exllamav2\n", "!cp {MODEL_NAME} base_model\n", "!rm base_mode/*.bin\n", "\n", "# Download dataset\n", "!wget https://huggingface.co/datasets/wikitext/resolve/9a9e482b5987f9d25b3a9b2883fc6cc9fd8071b3/wikitext-103-v1/wikitext-test.parquet\n", "\n", "# Quantize model\n", "save_folder = MODEL_ID + \"-EXL2\"\n", "!mkdir {save_folder}\n", "!python exllamav2/convert.py \\\n", " -i base_model \\\n", " -o {save_folder} \\\n", " -c wikitext-test.parquet \\\n", " -b {BPW}\n", "\n", "# Copy files\n", "!rm -rf quant/out_tensor\n", "!rsync -av --exclude='*.safetensors' --exclude='.*' ./base_model/ ./{save_folder}/\n", "\n", "# Create model card\n", "card = ModelCard.load(MODEL_ID)\n", "card.data.tags.append(\"autoquant\")\n", "card.data.tags.append(\"exl2\")\n", "card.save(f'{save_folder}/README.md')\n", "\n", "# Upload model\n", "create_repo(\n", " repo_id = f\"{USERNAME}/{MODEL_NAME}-{BPW:.1f}bpw-exl2\",\n", " repo_type=\"model\",\n", " exist_ok=True,\n", " token=hf_token\n", ")\n", "api.upload_folder(\n", " folder_path=save_folder,\n", " repo_id=f\"{USERNAME}/{MODEL_NAME}-{BPW:.1f}bpw-exl2\",\n", " token=hf_token\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "id": "MyyUO2Fj3WHt" }, "outputs": [], "source": [ "# @title ## ⚖️ AWQ\n", "\n", "# @markdown See the [AutoAWQ repository](https://github.com/casper-hansen/AutoAWQ) for more information.\n", "\n", "# Install AutoAWQ\n", "!pip install -qqq -U https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.4/autoawq-0.2.4+cu118-cp310-cp310-linux_x86_64.whl\n", 
"!pip install zstandard\n", "\n", "from awq import AutoAWQForCausalLM\n", "from transformers import AutoTokenizer\n", "\n", "BITS = 4 # @param {type: \"integer\"}\n", "GROUP_SIZE = 128 # @param {type: \"integer\"}\n", "VERSION = \"GEMM\" # @param {type: \"string\"}\n", "ZERO_POINT = True # @param {type: \"boolean\"}\n", "\n", "quant_config = {\n", " \"w_bit\": BITS,\n", " \"q_group_size\": GROUP_SIZE,\n", " \"version\": VERSION,\n", " \"zero_point\": ZERO_POINT\n", "}\n", "save_folder = MODEL_NAME + \"-AWQ\"\n", "\n", "# Quantize model\n", "model = AutoAWQForCausalLM.from_pretrained(MODEL_NAME, safetensors=True, low_cpu_mem_usage=True)\n", "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)\n", "model.quantize(tokenizer, quant_config=quant_config)\n", "\n", "# Save model and tokenizer\n", "model.save_quantized(save_folder)\n", "tokenizer.save_pretrained(save_folder)\n", "\n", "# Create model card\n", "card = ModelCard.load(MODEL_ID)\n", "card.data.tags.append(\"autoquant\")\n", "card.data.tags.append(\"awq\")\n", "card.save(f'{save_folder}/README.md')\n", "\n", "# Upload model\n", "create_repo(\n", " repo_id = f\"{USERNAME}/{MODEL_NAME}-AWQ\",\n", " repo_type=\"model\",\n", " exist_ok=True,\n", " token=hf_token\n", ")\n", "api.upload_folder(\n", " folder_path=save_folder,\n", " repo_id=f\"{USERNAME}/{MODEL_NAME}-AWQ\",\n", " token=hf_token\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cellView": "form", "id": "iEhLsUjcnNR7" }, "outputs": [], "source": [ "# @title ## 🐘 HQQ\n", "\n", "# @markdown See the official [HQQ repository](https://github.com/mobiusml/hqq) for more information.\n", "\n", "!git clone https://github.com/mobiusml/hqq.git\n", "!pip install -e hqq\n", "!python hqq/kernels/setup_cuda.py install\n", "!pip install flash-attn --no-build-isolation\n", "!pip install transformers --upgrade\n", "!num_threads=8; OMP_NUM_THREADS=$num_threads CUDA_VISIBLE_DEVICES=0\n", "\n", "import torch\n", "from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer\n", "from hqq.models.hf.base import AutoHQQHFModel\n", "from hqq.core.quantize import *\n", "\n", "BITS = 2 # @param {type:\"integer\"}\n", "GROUP_SIZE = 128 # @param {type:\"integer\"}\n", "\n", "# Quant config\n", "quant_config = BaseQuantizeConfig(\n", " nbits=BITS,\n", " group_size=GROUP_SIZE\n", ")\n", "\n", "# Quantize model\n", "model = HQQModelForCausalLM.from_pretrained(\n", " MODEL_ID,\n", " cache_dir=\".\",\n", " attn_implementation=\"flash_attention_2\"\n", ")\n", "tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)\n", "model.quantize_model(quant_config=quant_config, device='cuda')\n", "\n", "# Save model and tokenizer\n", "save_folder = MODEL_ID + \"-HQQ\"\n", "model.save_quantized(save_folder)\n", "tokenizer.save_pretrained(save_folder)\n", "\n", "# Create model card\n", "card = ModelCard.load(MODEL_ID)\n", "card.data.tags.append(\"autoquant\")\n", "card.data.tags.append(\"hqq\")\n", "card.save(f'{save_folder}/README.md')\n", "\n", "# Upload model\n", "create_repo(\n", " repo_id = f\"{USERNAME}/{MODEL_NAME}-{BITS}bit-HQQ\",\n", " repo_type=\"model\",\n", " exist_ok=True,\n", " token=hf_token\n", ")\n", "api.upload_folder(\n", " folder_path=save_folder,\n", " repo_id=f\"{USERNAME}/{MODEL_NAME}-{BITS}bit-HQQ\",\n", " token=hf_token\n", ")" ] } ], "metadata": { "accelerator": "GPU", "colab": { "gpuType": "T4", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 
}