mknolan committed (verified)
Commit da1f9eb · Parent(s): b3a024e

Upload InternVL2 implementation

Files changed (1):
  app_internvl2.py  +115 -37
app_internvl2.py CHANGED
@@ -145,17 +145,29 @@ def load_internvl2_model():
 
     print("Loading InternVL2 model...")
     try:
+        # Force synchronous execution for everything
+        import os
+        # Set environment variables to force synchronous behavior
+        os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+        # Disable asyncio in lmdeploy
+        os.environ["LMDEPLOY_DISABLE_ASYNC"] = "1"
+
         # Configure for AWQ quantized model
-        backend_config = TurbomindEngineConfig(model_format='awq')
+        backend_config = TurbomindEngineConfig(
+            model_format='awq',
+            session_len=2048  # Explicitly set session length
+        )
 
-        # Create pipeline with non-streaming mode to avoid asyncio conflicts
+        # Create a synchronous pipeline to avoid asyncio issues
+        # Explicitly set all parameters that might default to async behavior
         internvl2_pipeline = pipeline(
             MODEL_ID,
             backend_config=backend_config,
             log_level='INFO',
             model_name_or_path=None,
             backend_name="turbomind",
-            stream=False  # Important: disable streaming to avoid asyncio issues
+            stream=False,  # Important: disable streaming
+            tensor_parallel=1,  # Use single GPU to avoid distributed processing
         )
 
         print("InternVL2 model loaded successfully!")
@@ -196,46 +208,112 @@ def analyze_image(image, prompt):
         else:
             # If somehow it's already a PIL Image
             image_pil = image.convert('RGB')
+
+        # We'll use a completely different approach - multiprocessing
+        # This runs the model in a separate process, avoiding any event loop conflicts
+        import multiprocessing as mp
 
-        # Completely bypass asyncio by using a dedicated thread for model inference
-        import threading
-        import queue
-
-        result_queue = queue.Queue()
-
-        def run_inference_in_thread():
+        # Define a function to run in a separate process
+        def run_in_process(prompt, image_path, result_queue):
             try:
-                # Run the model in a dedicated thread
-                response = internvl2_pipeline((prompt, image_pil))
-                result_text = response.text if hasattr(response, "text") else str(response)
-                result_queue.put(("success", result_text))
+                # Set environment variables in the subprocess
+                import os
+                os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+                os.environ["LMDEPLOY_DISABLE_ASYNC"] = "1"
+
+                # Import libraries inside the process
+                from lmdeploy import pipeline, TurbomindEngineConfig
+
+                # Save the image to a temporary file to pass between processes
+                import tempfile
+                import torch
+
+                # Check GPU in subprocess
+                print(f"Subprocess GPU available: {torch.cuda.is_available()}")
+
+                # Configure for AWQ quantized model
+                backend_config = TurbomindEngineConfig(
+                    model_format='awq',
+                    session_len=2048
+                )
+
+                # Create new pipeline in the subprocess
+                model_pipeline = pipeline(
+                    MODEL_ID,
+                    backend_config=backend_config,
+                    log_level='INFO',
+                    model_name_or_path=None,
+                    backend_name="turbomind",
+                    stream=False,
+                    tensor_parallel=1,
+                )
+
+                # Load the image in the subprocess
+                from PIL import Image
+                image = Image.open(image_path).convert('RGB')
+
+                # Run inference
+                response = model_pipeline((prompt, image))
+                result = response.text if hasattr(response, "text") else str(response)
+
+                # Put the result in the queue
+                result_queue.put(("success", result))
+
             except Exception as e:
-                result_queue.put(("error", str(e)))
+                import traceback
+                error_msg = f"Error in subprocess: {str(e)}\n{traceback.format_exc()}"
+                result_queue.put(("error", error_msg))
 
-        # Start a dedicated thread for inference
-        print("Running model inference in a dedicated thread")
-        inference_thread = threading.Thread(target=run_inference_in_thread)
-        inference_thread.daemon = True  # Allow the thread to be terminated when the main program exits
-        inference_thread.start()
+        # Create a temporary file for the image
+        import tempfile
+        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
+            temp_path = temp_file.name
+            image_pil.save(temp_path)
 
-        # Wait for the thread to complete (with timeout)
-        inference_thread.join(timeout=120)  # 2 minute timeout
-
-        if inference_thread.is_alive():
-            # If the thread is still running after timeout
-            return "Model inference timed out after 120 seconds. The model might be too slow on this hardware."
-
-        # Get the result from the queue
-        if not result_queue.empty():
-            status, result = result_queue.get()
-            if status == "error":
-                return f"Error in model inference: {result}"
+        try:
+            # Create a process-safe queue
+            result_queue = mp.Queue()
+
+            # Start the process
+            print("Starting model inference in a separate process")
+            process = mp.Process(
+                target=run_in_process,
+                args=(prompt, temp_path, result_queue)
+            )
+
+            # Make it a daemon so it terminates when the main process ends
+            process.daemon = True
+            process.start()
+
+            # Wait for the process to complete (with timeout)
+            process.join(timeout=180)  # 3 minute timeout
+
+            # Delete the temporary file
+            try:
+                os.unlink(temp_path)
+            except:
+                pass
+
+            if process.is_alive():
+                # Terminate the process if it's still running after timeout
+                process.terminate()
+                return "Model inference timed out after 180 seconds. The model might be too slow on this hardware."
+
+            # Get the result from the queue (non-blocking to avoid hanging)
+            if not result_queue.empty():
+                status, result = result_queue.get(block=False)
+                if status == "error":
+                    return f"Error in model inference: {result}"
+                else:
+                    elapsed_time = time.time() - start_time
+                    return result
             else:
-                elapsed_time = time.time() - start_time
-                return result
-        else:
-            return "Unknown error: Model inference did not produce a result"
-
+                return "Unknown error: Model inference process completed but did not produce a result"
+
+        except Exception as e:
+            print(f"Error in multiprocessing: {str(e)}")
+            return f"Error setting up multiprocessing: {str(e)}"
+
     except Exception as e:
         print(f"Error in image analysis: {str(e)}")
         # Try to clean up memory in case of error
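
The core pattern the second hunk introduces is: write the input image to a temporary file, run inference in a separate multiprocessing.Process, pass the result back through a multiprocessing.Queue, and enforce a hard timeout with join()/terminate(). The sketch below isolates that pattern so it can be run without the model; fake_inference, run_worker, and analyze_with_timeout are illustrative names that do not appear in the commit, and the placeholder inference call stands in for the lmdeploy pipeline call that the real subprocess performs.

import multiprocessing as mp
import os
import tempfile
import time


def fake_inference(prompt, image_path):
    # Placeholder for the real model call; sleeps briefly to simulate work.
    time.sleep(1)
    return f"description of {os.path.basename(image_path)} for prompt: {prompt!r}"


def run_worker(prompt, image_path, result_queue):
    # Worker process: do the heavy lifting and report back through the queue.
    try:
        result = fake_inference(prompt, image_path)
        result_queue.put(("success", result))
    except Exception as e:
        result_queue.put(("error", str(e)))


def analyze_with_timeout(prompt, image_bytes, timeout=180):
    # Write the input to a temp file so the subprocess can read it independently.
    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
        tmp.write(image_bytes)
        temp_path = tmp.name

    try:
        result_queue = mp.Queue()
        process = mp.Process(target=run_worker, args=(prompt, temp_path, result_queue))
        process.daemon = True
        process.start()
        process.join(timeout=timeout)  # block the caller, but never longer than `timeout`

        if process.is_alive():
            process.terminate()  # kill the worker if it overran the time budget
            return "timed out"

        if not result_queue.empty():
            status, result = result_queue.get(block=False)
            return result if status == "success" else f"error: {result}"
        return "no result produced"
    finally:
        os.unlink(temp_path)


if __name__ == "__main__":
    print(analyze_with_timeout("Describe this image.", b"\xff\xd8fake-jpeg-bytes"))

In the commit itself, the worker additionally rebuilds the lmdeploy pipeline inside the subprocess, so each request loads the model afresh; that keeps the worker fully isolated from the parent process and its event loop, at the cost of repeating the model load on every call.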