mknolan committed (verified)
Commit da1f9eb · Parent(s): b3a024e

Upload InternVL2 implementation

Files changed (1):
  app_internvl2.py  +115 -37
app_internvl2.py CHANGED
@@ -145,17 +145,29 @@ def load_internvl2_model():
 
     print("Loading InternVL2 model...")
     try:
+        # Force synchronous execution for everything
+        import os
+        # Set environment variables to force synchronous behavior
+        os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+        # Disable asyncio in lmdeploy
+        os.environ["LMDEPLOY_DISABLE_ASYNC"] = "1"
+
         # Configure for AWQ quantized model
-        backend_config = TurbomindEngineConfig(model_format='awq')
+        backend_config = TurbomindEngineConfig(
+            model_format='awq',
+            session_len=2048  # Explicitly set session length
+        )
 
-        # Create pipeline with non-streaming mode to avoid asyncio conflicts
+        # Create a synchronous pipeline to avoid asyncio issues
+        # Explicitly set all parameters that might default to async behavior
         internvl2_pipeline = pipeline(
             MODEL_ID,
             backend_config=backend_config,
             log_level='INFO',
             model_name_or_path=None,
             backend_name="turbomind",
-            stream=False  # Important: disable streaming to avoid asyncio issues
+            stream=False,  # Important: disable streaming
+            tensor_parallel=1,  # Use single GPU to avoid distributed processing
         )
 
         print("InternVL2 model loaded successfully!")
@@ -196,46 +208,112 @@ def analyze_image(image, prompt):
         else:
             # If somehow it's already a PIL Image
             image_pil = image.convert('RGB')
+
+        # We'll use a completely different approach - multiprocessing
+        # This runs the model in a separate process, avoiding any event loop conflicts
+        import multiprocessing as mp
 
-        # Completely bypass asyncio by using a dedicated thread for model inference
-        import threading
-        import queue
-
-        result_queue = queue.Queue()
-
-        def run_inference_in_thread():
+        # Define a function to run in a separate process
+        def run_in_process(prompt, image_path, result_queue):
             try:
-                # Run the model in a dedicated thread
-                response = internvl2_pipeline((prompt, image_pil))
-                result_text = response.text if hasattr(response, "text") else str(response)
-                result_queue.put(("success", result_text))
+                # Set environment variables in the subprocess
+                import os
+                os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+                os.environ["LMDEPLOY_DISABLE_ASYNC"] = "1"
+
+                # Import libraries inside the process
+                from lmdeploy import pipeline, TurbomindEngineConfig
+
+                # Save the image to a temporary file to pass between processes
+                import tempfile
+                import torch
+
+                # Check GPU in subprocess
+                print(f"Subprocess GPU available: {torch.cuda.is_available()}")
+
+                # Configure for AWQ quantized model
+                backend_config = TurbomindEngineConfig(
+                    model_format='awq',
+                    session_len=2048
+                )
+
+                # Create new pipeline in the subprocess
+                model_pipeline = pipeline(
+                    MODEL_ID,
+                    backend_config=backend_config,
+                    log_level='INFO',
+                    model_name_or_path=None,
+                    backend_name="turbomind",
+                    stream=False,
+                    tensor_parallel=1,
+                )
+
+                # Load the image in the subprocess
+                from PIL import Image
+                image = Image.open(image_path).convert('RGB')
+
+                # Run inference
+                response = model_pipeline((prompt, image))
+                result = response.text if hasattr(response, "text") else str(response)
+
+                # Put the result in the queue
+                result_queue.put(("success", result))
+
             except Exception as e:
-                result_queue.put(("error", str(e)))
+                import traceback
+                error_msg = f"Error in subprocess: {str(e)}\n{traceback.format_exc()}"
+                result_queue.put(("error", error_msg))
 
-        # Start a dedicated thread for inference
-        print("Running model inference in a dedicated thread")
-        inference_thread = threading.Thread(target=run_inference_in_thread)
-        inference_thread.daemon = True  # Allow the thread to be terminated when the main program exits
-        inference_thread.start()
+        # Create a temporary file for the image
+        import tempfile
+        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
+            temp_path = temp_file.name
+            image_pil.save(temp_path)
 
-        # Wait for the thread to complete (with timeout)
-        inference_thread.join(timeout=120)  # 2 minute timeout
-
-        if inference_thread.is_alive():
-            # If the thread is still running after timeout
-            return "Model inference timed out after 120 seconds. The model might be too slow on this hardware."
-
-        # Get the result from the queue
-        if not result_queue.empty():
-            status, result = result_queue.get()
-            if status == "error":
-                return f"Error in model inference: {result}"
+        try:
+            # Create a process-safe queue
+            result_queue = mp.Queue()
+
+            # Start the process
+            print("Starting model inference in a separate process")
+            process = mp.Process(
+                target=run_in_process,
+                args=(prompt, temp_path, result_queue)
+            )
+
+            # Make it a daemon so it terminates when the main process ends
+            process.daemon = True
+            process.start()
+
+            # Wait for the process to complete (with timeout)
+            process.join(timeout=180)  # 3 minute timeout
+
+            # Delete the temporary file
+            try:
+                os.unlink(temp_path)
+            except:
+                pass
+
+            if process.is_alive():
+                # Terminate the process if it's still running after timeout
+                process.terminate()
+                return "Model inference timed out after 180 seconds. The model might be too slow on this hardware."
+
+            # Get the result from the queue (non-blocking to avoid hanging)
+            if not result_queue.empty():
+                status, result = result_queue.get(block=False)
+                if status == "error":
+                    return f"Error in model inference: {result}"
+                else:
+                    elapsed_time = time.time() - start_time
+                    return result
             else:
-                elapsed_time = time.time() - start_time
-                return result
-        else:
-            return "Unknown error: Model inference did not produce a result"
-
+                return "Unknown error: Model inference process completed but did not produce a result"
+
+        except Exception as e:
+            print(f"Error in multiprocessing: {str(e)}")
+            return f"Error setting up multiprocessing: {str(e)}"
+
     except Exception as e:
         print(f"Error in image analysis: {str(e)}")
         # Try to clean up memory in case of error
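
The core pattern the second hunk introduces is: write the input image to a temporary file, run inference in a separate multiprocessing.Process, pass the result back through a multiprocessing.Queue, and enforce a hard timeout with join()/terminate(). The sketch below isolates that pattern so it can be run without the model; fake_inference, run_worker, and analyze_with_timeout are illustrative names that do not appear in the commit, and the placeholder inference call stands in for the lmdeploy pipeline call that the real subprocess performs.

import multiprocessing as mp
import os
import tempfile
import time


def fake_inference(prompt, image_path):
    # Placeholder for the real model call; sleeps briefly to simulate work.
    time.sleep(1)
    return f"description of {os.path.basename(image_path)} for prompt: {prompt!r}"


def run_worker(prompt, image_path, result_queue):
    # Worker process: do the heavy lifting and report back through the queue.
    try:
        result = fake_inference(prompt, image_path)
        result_queue.put(("success", result))
    except Exception as e:
        result_queue.put(("error", str(e)))


def analyze_with_timeout(prompt, image_bytes, timeout=180):
    # Write the input to a temp file so the subprocess can read it independently.
    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
        tmp.write(image_bytes)
        temp_path = tmp.name

    try:
        result_queue = mp.Queue()
        process = mp.Process(target=run_worker, args=(prompt, temp_path, result_queue))
        process.daemon = True
        process.start()
        process.join(timeout=timeout)  # block the caller, but never longer than `timeout`

        if process.is_alive():
            process.terminate()  # kill the worker if it overran the time budget
            return "timed out"

        if not result_queue.empty():
            status, result = result_queue.get(block=False)
            return result if status == "success" else f"error: {result}"
        return "no result produced"
    finally:
        os.unlink(temp_path)


if __name__ == "__main__":
    print(analyze_with_timeout("Describe this image.", b"\xff\xd8fake-jpeg-bytes"))

In the commit itself, the worker additionally rebuilds the lmdeploy pipeline inside the subprocess, so each request loads the model afresh; that keeps the worker fully isolated from the parent process and its event loop, at the cost of repeating the model load on every call.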