jbilcke-hf (HF Staff) committed
Commit 422262e · 1 Parent(s): 8e4eac4

fix issue with YAML parsing

api_core.py CHANGED
@@ -319,6 +319,7 @@ class VideoGenerationAPI:
                              model_override: Optional[str] = None) -> str:
         """
         Helper method to generate text using the appropriate client and configuration.
+        Tries chat_completion first (modern standard), falls back to text_generation.
 
         Args:
             prompt: The prompt to generate text from
@@ -333,37 +334,83 @@ class VideoGenerationAPI:
         # Get the appropriate client
         client = self._get_inference_client(llm_config)
 
-        # For third-party providers, we don't need to specify model in text_generation
-        # as it's already configured in the client
-        if llm_config and llm_config.get('provider') != 'huggingface':
-            response = await asyncio.get_event_loop().run_in_executor(
-                None,
-                lambda: client.text_generation(
-                    prompt,
-                    max_new_tokens=max_new_tokens,
-                    temperature=temperature
-                )
-            )
-        else:
-            # For HuggingFace models, we need to specify the model
-            if model_override:
-                model_to_use = model_override
-            elif llm_config:
-                model_to_use = llm_config.get('model', TEXT_MODEL)
-            else:
-                model_to_use = TEXT_MODEL
-
-            response = await asyncio.get_event_loop().run_in_executor(
-                None,
-                lambda: client.text_generation(
-                    prompt,
-                    model=model_to_use,
-                    max_new_tokens=max_new_tokens,
-                    temperature=temperature
-                )
-            )
-
-        return response
+        # Determine the model to use
+        if model_override:
+            model_to_use = model_override
+        elif llm_config:
+            model_to_use = llm_config.get('model', TEXT_MODEL)
+        else:
+            model_to_use = TEXT_MODEL
+
+        # Try chat_completion first (modern standard, more widely supported)
+        try:
+            messages = [{"role": "user", "content": prompt}]
+
+            if llm_config and llm_config.get('provider') != 'huggingface':
+                # For third-party providers
+                completion = await asyncio.get_event_loop().run_in_executor(
+                    None,
+                    lambda: client.chat.completions.create(
+                        messages=messages,
+                        max_tokens=max_new_tokens,
+                        temperature=temperature
+                    )
+                )
+            else:
+                # For HuggingFace models, specify the model
+                completion = await asyncio.get_event_loop().run_in_executor(
+                    None,
+                    lambda: client.chat.completions.create(
+                        model=model_to_use,
+                        messages=messages,
+                        max_tokens=max_new_tokens,
+                        temperature=temperature
+                    )
+                )
+
+            # Extract the generated text from the chat completion response
+            return completion.choices[0].message.content
+
+        except Exception as e:
+            error_message = str(e).lower()
+            # Check if the error is related to task compatibility or API not supported
+            if ("not supported for task" in error_message or
+                "conversational" in error_message or
+                "chat" in error_message):
+                logger.info(f"chat_completion not supported, falling back to text_generation: {e}")
+
+                # Fall back to text_generation API
+                try:
+                    if llm_config and llm_config.get('provider') != 'huggingface':
+                        # For third-party providers
+                        response = await asyncio.get_event_loop().run_in_executor(
+                            None,
+                            lambda: client.text_generation(
+                                prompt,
+                                max_new_tokens=max_new_tokens,
+                                temperature=temperature
+                            )
+                        )
+                    else:
+                        # For HuggingFace models, specify the model
+                        response = await asyncio.get_event_loop().run_in_executor(
+                            None,
+                            lambda: client.text_generation(
+                                prompt,
+                                model=model_to_use,
+                                max_new_tokens=max_new_tokens,
+                                temperature=temperature
+                            )
+                        )
+                    return response
+
+                except Exception as text_error:
+                    logger.error(f"Both chat_completion and text_generation failed: {text_error}")
+                    raise text_error
+            else:
+                # Re-raise the original error if it's not a task compatibility issue
+                logger.error(f"chat_completion failed with non-compatibility error: {e}")
+                raise e
 
 
     def _add_event(self, video_id: str, event: Dict[str, Any]):
@@ -486,16 +533,35 @@ Describe the first scene/shot for: "{query}".
     title: \""""
 
         try:
-            response = await self._generate_text(
+            raw_yaml_str = await self._generate_text(
                 prompt,
                 llm_config=llm_config,
                 max_new_tokens=200,
                 temperature=temperature
             )
 
-            response_text = re.sub(r'^\s*\.\s*\n', '', f"title: \"{response.strip()}")
-            sanitized_yaml = sanitize_yaml_response(response_text)
+            raw_yaml_str = raw_yaml_str.strip()
+
+            #logger.info(f"search_video(): raw_yaml_str = {raw_yaml_str}")
+
+            if raw_yaml_str.startswith("```yaml"):
+                # Remove the "```yaml" at the beginning and closing ```
+                raw_yaml_str = raw_yaml_str[7:]  # Remove "```yaml" (7 characters)
+                if raw_yaml_str.endswith("```"):
+                    raw_yaml_str = raw_yaml_str[:-3]  # Remove closing ```
+                raw_yaml_str = raw_yaml_str.strip()
+            elif raw_yaml_str.startswith("```"):
+                # Remove the "```" at the beginning and closing ```
+                raw_yaml_str = raw_yaml_str[3:]  # Remove opening ```
+                if raw_yaml_str.endswith("```"):
+                    raw_yaml_str = raw_yaml_str[:-3]  # Remove closing ```
+                raw_yaml_str = raw_yaml_str.strip()
+            else:
+                raw_yaml_str = re.sub(r'^\s*\.\s*\n', '', f"title: \"{raw_yaml_str}")
+
+            sanitized_yaml = sanitize_yaml_response(raw_yaml_str)
+            #logger.info(f"search_video(): sanitized_yaml = {sanitized_yaml}")
+
             try:
                 result = yaml.safe_load(sanitized_yaml)
             except yaml.YAMLError as e:
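For reference, the core of the YAML-parsing fix above can be exercised in isolation. This is a minimal sketch, not the application code itself: the strip_markdown_fences helper name is hypothetical, and it uses plain yaml.safe_load instead of the project's sanitize_yaml_response:

import yaml

def strip_markdown_fences(raw: str) -> str:
    # Remove a leading ```yaml / ``` fence pair that chat models often wrap around YAML output.
    raw = raw.strip()
    if raw.startswith("```yaml"):
        raw = raw[7:]          # drop the opening ```yaml (7 characters)
    elif raw.startswith("```"):
        raw = raw[3:]          # drop a bare opening ```
    if raw.endswith("```"):
        raw = raw[:-3]         # drop the closing ```
    return raw.strip()

# Example: a chat-style response wrapped in a fenced block
response = """```yaml
title: "A walk on the beach"
description: "Waves at sunset"
```"""

parsed = yaml.safe_load(strip_markdown_fences(response))
print(parsed["title"])  # -> A walk on the beach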
assets/config/curated_models.yaml CHANGED
@@ -74,9 +74,9 @@ models:
     display_name: Qwen3 235B A22B
     num_of_parameters: 235B
 
-  - model_id: deepseek-ai/DeepSeek-V3-0324
-    display_name: DeepSeek V3
-    num_of_parameters: 685B
+  #- model_id: deepseek-ai/DeepSeek-V3-0324
+  #  display_name: DeepSeek V3
+  #  num_of_parameters: 685B
 
   - model_id: moonshotai/Kimi-K2-Instruct
     display_name: Kimi K2
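As a small illustration of how entries in this file are shaped, a curated models list with the structure shown above can be read with PyYAML; this loader is a sketch for illustration only, not the app's actual loading code:

import yaml

# Illustrative reader for a curated_models.yaml with a top-level "models:" list.
with open("assets/config/curated_models.yaml") as f:
    config = yaml.safe_load(f)

for model in config.get("models", []):
    # Commented-out entries (like DeepSeek V3 above) simply never appear here.
    print(model["model_id"], "-", model.get("display_name"), model.get("num_of_parameters"))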
build/web/assets/assets/config/curated_models.yaml CHANGED
@@ -74,9 +74,9 @@ models:
     display_name: Qwen3 235B A22B
     num_of_parameters: 235B
 
-  - model_id: deepseek-ai/DeepSeek-V3-0324
-    display_name: DeepSeek V3
-    num_of_parameters: 685B
+  #- model_id: deepseek-ai/DeepSeek-V3-0324
+  #  display_name: DeepSeek V3
+  #  num_of_parameters: 685B
 
   - model_id: moonshotai/Kimi-K2-Instruct
     display_name: Kimi K2
build/web/flutter_bootstrap.js CHANGED
@@ -38,6 +38,6 @@ _flutter.buildConfig = {"engineRevision":"1c9c20e7c3dd48c66f400a24d48ea806b4ab31
 
 _flutter.loader.load({
   serviceWorkerSettings: {
-    serviceWorkerVersion: "485624187"
+    serviceWorkerVersion: "3912302714"
   }
 });
build/web/flutter_service_worker.js CHANGED
@@ -3,7 +3,7 @@ const MANIFEST = 'flutter-app-manifest';
 const TEMP = 'flutter-temp-cache';
 const CACHE_NAME = 'flutter-app-cache';
 
-const RESOURCES = {"flutter_bootstrap.js": "67612d11664e1438c6f25ef6f2340c5f",
+const RESOURCES = {"flutter_bootstrap.js": "f833cb89d68c8ddba5bc70cec281205c",
 "version.json": "68350cac7987de2728345c72918dd067",
 "tikslop.png": "570e1db759046e2d224fef729983634e",
 "index.html": "3a7029b3672560e7938aab6fa4d30a46",
@@ -28,7 +28,7 @@ const RESOURCES = {"flutter_bootstrap.js": "67612d11664e1438c6f25ef6f2340c5f",
 "assets/assets/ads/smolagents.gif": "45338af5a4d440b707d02f364be8195c",
 "assets/assets/ads/README.md": "1959fb6b85a966348396f2f0f9c3f32a",
 "assets/assets/ads/lerobot.gif": "0f90b2fc4d15eefb5572363724d6d925",
-"assets/assets/config/curated_models.yaml": "32e4a868fcaf5617bccfb5a1a26f9690",
+"assets/assets/config/curated_models.yaml": "94e54843953b4f90c454cd8e5a3176fb",
 "assets/assets/config/README.md": "07a87720dd00dd1ca98c9d6884440e31",
 "assets/assets/config/custom.yaml": "52bd30aa4d8b980626a5eb02d0871c01",
 "assets/assets/config/default.yaml": "9ca1d05d06721c2b6f6382a1ba40af48",
build/web/index.html CHANGED
@@ -156,7 +156,7 @@
 </script>
 
 <!-- Add version parameter for cache busting -->
-<script src="flutter_bootstrap.js?v=1753274269" async></script>
+<script src="flutter_bootstrap.js?v=1753281547" async></script>
 
 <!-- Add cache busting script -->
 <script>
docs/for-bots/huggingface/chat-completion.md ADDED
@@ -0,0 +1,734 @@
Chat Completion
---------------

Generate a response given a list of messages in a conversational context, supporting both conversational Language Models (LLMs) and conversational Vision-Language Models (VLMs). This is a subtask of [`text-generation`](https://huggingface.co/docs/inference-providers/tasks/text-generation) and [`image-text-to-text`](https://huggingface.co/docs/inference-providers/tasks/image-text-to-text).

### Recommended models

#### Conversational Large Language Models (LLMs)

* [google/gemma-2-2b-it](https://huggingface.co/google/gemma-2-2b-it): A text-generation model trained to follow instructions.
* [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B): Smaller variant of one of the most powerful models.
* [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct): Very powerful text generation model trained to follow instructions.
* [microsoft/phi-4](https://huggingface.co/microsoft/phi-4): Powerful text generation model by Microsoft.
* [Qwen/Qwen2.5-7B-Instruct-1M](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-1M): Strong conversational model that supports very long instructions.
* [Qwen/Qwen2.5-Coder-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct): Text generation model used to write code.
* [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1): Powerful reasoning based open large language model.

#### Conversational Vision-Language Models (VLMs)

* [Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct): Strong image-text-to-text model.

Explore all available models and find the one that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=image-text-to-text&sort=trending).

### API Playground

For Chat Completion models, we provide an interactive UI Playground for easier testing:

* Quickly iterate on your prompts from the UI.
* Set and override system, assistant and user messages.
* Browse and select models currently available on the Inference API.
* Compare the output of two models side-by-side.
* Adjust requests parameters from the UI.
* Easily switch between UI view and code snippets.

[![Playground](https://cdn-uploads.huggingface.co/production/uploads/5f17f0a0925b9863e28ad517/9_Tgf0Tv65srhBirZQMTp.png)](https://huggingface.co/playground)

Access the Inference UI Playground and start exploring: [https://huggingface.co/playground](https://huggingface.co/playground)

### Using the API

The API supports:

* Using the chat completion API compatible with the OpenAI SDK.
* Using grammars, constraints, and tools.
* Streaming the output.

#### Code snippet example for conversational LLMs

Python example using the `huggingface_hub` client (the original page also offers JavaScript and cURL variants, `requests` and `openai` clients, and a provider selector):

import os
from huggingface_hub import InferenceClient

client = InferenceClient(
    provider="featherless-ai",
    api_key=os.environ["HF_TOKEN"],
)

completion = client.chat.completions.create(
    model="meta-llama/Llama-3.3-70B-Instruct",
    messages=[
        {
            "role": "user",
            "content": "What is the capital of France?"
        }
    ],
)

print(completion.choices[0].message)

#### Code snippet example for conversational VLMs

Python example using the `huggingface_hub` client (JavaScript, cURL, `requests` and `openai` variants are also available):

import os
from huggingface_hub import InferenceClient

client = InferenceClient(
    provider="fireworks-ai",
    api_key=os.environ["HF_TOKEN"],
)

completion = client.chat.completions.create(
    model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Describe this image in one sentence."
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                    }
                }
            ]
        }
    ],
)

print(completion.choices[0].message)

### API specification

#### Request

Headers:

* **authorization** (string): Authentication header in the form `'Bearer: hf_****'` where `hf_****` is a personal user access token with "Inference Providers" permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens/new?ownUserPermissions=inference.serverless.write&tokenType=fineGrained).

Payload:

* **frequency_penalty** (number): Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
* **logprobs** (boolean): Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each output token returned in the content of message.
* **max_tokens** (integer): The maximum number of tokens that can be generated in the chat completion.
* **messages*** (object[]): A list of messages comprising the conversation so far. Each message is either a content message ({role*, name, content*}, where content is a string or an array of {type: "text", text*} / {type: "image_url", image_url*: {url*}} parts) or a tool-call message ({role*, tool_calls*: [{id*, type*, function*: {name*, description, parameters*}}]}).
* **presence_penalty** (number): Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
* **response_format**: One of {type: "text"}, {type: "json_schema", json_schema*: {name*, description, schema, strict}}, or {type: "json_object"}. The schema is described as a JSON Schema object (learn how to build JSON schemas [here](https://json-schema.org/)); if strict is set to true, the model will always follow the exact schema defined in the `schema` field.
* **seed** (integer)
* **stop** (string[]): Up to 4 sequences where the API will stop generating further tokens.
* **stream** (boolean)
* **stream_options** (object): include_usage (boolean): if set, an additional chunk will be streamed before the data: [DONE] message; the usage field on this chunk shows the token usage statistics for the entire request, and the choices field will always be an empty array. All other chunks will also include a usage field, but with a null value.
* **temperature** (number): What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. We generally recommend altering this or `top_p` but not both.
* **tool_choice**: One of "auto", "none", "required", or {function*: {name*}}.
* **tool_prompt** (string): A prompt to be appended before the tools.
* **tools** (object[]): A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide a list of functions the model may generate JSON inputs for. Each entry has type* (string) and function* ({name*, description, parameters*}).
* **top_logprobs** (integer): An integer between 0 and 5 specifying the number of most likely tokens to return at each token position, each with an associated log probability. logprobs must be set to true if this parameter is used.
* **top_p** (number): An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.

#### Response

Output type depends on the `stream` input parameter. If `stream` is `false` (default), the response will be a JSON object with the following fields:

* **choices** (object[]): each choice has finish_reason (string), index (integer), logprobs (object: content[] of {token, logprob, top_logprobs[] of {token, logprob}}), and message, which is either {content (string), role (string), tool_call_id (string)} or {role (string), tool_calls (object[] of {id, type, function: {name, description, arguments}})}.
* **created** (integer)
* **id** (string)
* **model** (string)
* **system_fingerprint** (string)
* **usage** (object): completion_tokens (integer), prompt_tokens (integer), total_tokens (integer).

If `stream` is `true`, generated tokens are returned as a stream, using Server-Sent Events (SSE). For more information about streaming, check out [this guide](https://huggingface.co/docs/text-generation-inference/conceptual/streaming). Each streamed chunk is a JSON object with the following fields:

* **choices** (object[]): each choice has delta (either {content (string), role (string), tool_call_id (string)} or {role (string), tool_calls (object[] of {id, index, type, function: {name, arguments}})}), finish_reason (string), index (integer), and logprobs (object: content[] of {token, logprob, top_logprobs[] of {token, logprob}}).
* **created** (integer)
* **id** (string)
* **model** (string)
* **system_fingerprint** (string)
* **usage** (object): completion_tokens (integer), prompt_tokens (integer), total_tokens (integer).

[< > Update on GitHub](https://github.com/huggingface/hub-docs/blob/main/docs/inference-providers/tasks/chat-completion.md)
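The task page above notes that streaming is supported. As an illustrative sketch (not one of the page's own snippets), the huggingface_hub client can stream tokens by passing stream=True and iterating over the returned chunks; the model name is only an example:

import os
from huggingface_hub import InferenceClient

client = InferenceClient(api_key=os.environ["HF_TOKEN"])

# stream=True returns an iterator of chunks (Server-Sent Events under the hood)
stream = client.chat.completions.create(
    model="meta-llama/Llama-3.3-70B-Instruct",  # example model
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    max_tokens=128,
    stream=True,
)

for chunk in stream:
    # Each chunk carries an incremental delta; content can be None on the final chunk.
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="")
print()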
docs/for-bots/huggingface/text-generation.md ADDED
@@ -0,0 +1,493 @@
Text Generation
---------------

Generate text based on a prompt.

If you are interested in a Chat Completion task, which generates a response based on a list of messages, check out the [`chat-completion`](./chat_completion) task.

For more details about the `text-generation` task, check out its [dedicated page](https://huggingface.co/tasks/text-generation)! You will find examples and related materials.

### Recommended models

* [google/gemma-2-2b-it](https://huggingface.co/google/gemma-2-2b-it): A text-generation model trained to follow instructions.
* [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B): Smaller variant of one of the most powerful models.
* [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct): Very powerful text generation model trained to follow instructions.
* [microsoft/phi-4](https://huggingface.co/microsoft/phi-4): Powerful text generation model by Microsoft.
* [Qwen/Qwen2.5-7B-Instruct-1M](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-1M): Strong conversational model that supports very long instructions.
* [Qwen/Qwen2.5-Coder-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct): Text generation model used to write code.
* [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1): Powerful reasoning based open large language model.

Explore all available models and find the one that suits you best [here](https://huggingface.co/models?inference=warm&pipeline_tag=text-generation&sort=trending).

### Using the API

Python example using the `huggingface_hub` client (JavaScript, cURL, `requests` and `openai` variants are also available on the original page). The generated snippet calls the chat-completion endpoint; its `messages` argument has been fixed to a proper message list:

import os
from huggingface_hub import InferenceClient

client = InferenceClient(
    provider="featherless-ai",
    api_key=os.environ["HF_TOKEN"],
)

completion = client.chat.completions.create(
    model="mistralai/Magistral-Small-2506",
    messages=[
        {
            "role": "user",
            "content": "Can you please let us know more details about your "
        }
    ],
)

print(completion.choices[0].message)

### API specification

#### Request

Headers:

* **authorization** (string): Authentication header in the form `'Bearer: hf_****'` where `hf_****` is a personal user access token with "Inference Providers" permission. You can generate one from [your settings page](https://huggingface.co/settings/tokens/new?ownUserPermissions=inference.serverless.write&tokenType=fineGrained).

Payload:

* **inputs*** (string)
* **parameters** (object):
  * **adapter_id** (string): Lora adapter id.
  * **best_of** (integer): Generate best_of sequences and return the one with the highest token logprobs.
  * **decoder_input_details** (boolean): Whether to return decoder input token logprobs and ids.
  * **details** (boolean): Whether to return generation details.
  * **do_sample** (boolean): Activate logits sampling.
  * **frequency_penalty** (number): The parameter for frequency penalty. 1.0 means no penalty. Penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
  * **grammar**: One of {type: "json", value*: a string that represents a [JSON Schema](https://json-schema.org/)}, {type: "regex", value*: a regular expression string}, or {type: "json_schema", value*: {name (optional identifier for the schema), schema* (the actual JSON schema definition)}}.
  * **max_new_tokens** (integer): Maximum number of tokens to generate.
  * **repetition_penalty** (number): The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
  * **return_full_text** (boolean): Whether to prepend the prompt to the generated text.
  * **seed** (integer): Random sampling seed.
  * **stop** (string[]): Stop generating tokens if a member of `stop` is generated.
  * **temperature** (number): The value used to modulate the logits distribution.
  * **top_k** (integer): The number of highest probability vocabulary tokens to keep for top-k-filtering.
  * **top_n_tokens** (integer): The number of highest probability vocabulary tokens to keep for top-n-filtering.
  * **top_p** (number): Top-p value for nucleus sampling.
  * **truncate** (integer): Truncate inputs tokens to the given size.
  * **typical_p** (number): Typical Decoding mass. See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information.
  * **watermark** (boolean): Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226).
* **stream** (boolean)

#### Response

Output type depends on the `stream` input parameter. If `stream` is `false` (default), the response will be a JSON object with the following fields:

* **details** (object):
  * **best_of_sequences** (object[]): each with finish_reason (enum: length, eos_token, stop_sequence), generated_text (string), generated_tokens (integer), prefill (object[] of {id, logprob, text}), seed (integer), tokens (object[] of {id, logprob, special, text}), top_tokens (array[] of {id, logprob, special, text}).
  * **finish_reason** (enum): Possible values: length, eos_token, stop_sequence.
  * **generated_tokens** (integer)
  * **prefill** (object[] of {id (integer), logprob (number), text (string)})
  * **seed** (integer)
  * **tokens** (object[] of {id (integer), logprob (number), special (boolean), text (string)})
  * **top_tokens** (array[] of {id (integer), logprob (number), special (boolean), text (string)})
* **generated_text** (string)

If `stream` is `true`, generated tokens are returned as a stream, using Server-Sent Events (SSE). For more information about streaming, check out [this guide](https://huggingface.co/docs/text-generation-inference/conceptual/streaming). Each streamed chunk is a JSON object with the following fields:

* **details** (object): finish_reason (enum: length, eos_token, stop_sequence), generated_tokens (integer), input_length (integer), seed (integer).
* **generated_text** (string)
* **index** (integer)
* **token** (object): id (integer), logprob (number), special (boolean), text (string).
* **top_tokens** (object[]): id (integer), logprob (number), special (boolean), text (string).

[< > Update on GitHub](https://github.com/huggingface/hub-docs/blob/main/docs/inference-providers/tasks/text-generation.md)
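To round out the text-generation reference above, here is an illustrative sketch that calls the raw text-generation task through huggingface_hub (the page's own snippet goes through the chat-completion endpoint instead); the model name and parameter values are examples only:

import os
from huggingface_hub import InferenceClient

client = InferenceClient(api_key=os.environ["HF_TOKEN"])

# Plain prompt-in, continuation-out call; the keyword arguments mirror the payload above.
output = client.text_generation(
    "Can you please let us know more details about your ",
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",  # example model
    max_new_tokens=64,
    temperature=0.7,
    return_full_text=False,
)

print(output)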