Upload InternVL2 implementation

app_internvl2.py (+115 -37)
@@ -145,17 +145,29 @@ def load_internvl2_model():
 
     print("Loading InternVL2 model...")
     try:
+        # Force synchronous execution for everything
+        import os
+        # Set environment variables to force synchronous behavior
+        os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+        # Disable asyncio in lmdeploy
+        os.environ["LMDEPLOY_DISABLE_ASYNC"] = "1"
+
         # Configure for AWQ quantized model
-        backend_config = TurbomindEngineConfig(
+        backend_config = TurbomindEngineConfig(
+            model_format='awq',
+            session_len=2048  # Explicitly set session length
+        )
 
-        # Create
+        # Create a synchronous pipeline to avoid asyncio issues
+        # Explicitly set all parameters that might default to async behavior
         internvl2_pipeline = pipeline(
             MODEL_ID,
             backend_config=backend_config,
             log_level='INFO',
             model_name_or_path=None,
             backend_name="turbomind",
-            stream=False  # Important: disable streaming
+            stream=False,  # Important: disable streaming
+            tensor_parallel=1,  # Use single GPU to avoid distributed processing
         )
 
         print("InternVL2 model loaded successfully!")
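For reference, this hunk is aiming at lmdeploy's standard blocking call pattern. A minimal sketch, assuming lmdeploy with the TurboMind backend and an AWQ build of the model; the model ID, image path, and prompt below are illustrative, not taken from app_internvl2.py:

    from lmdeploy import pipeline, TurbomindEngineConfig
    from PIL import Image

    # Same engine settings as the diff: AWQ weights, fixed session length
    engine = TurbomindEngineConfig(model_format='awq', session_len=2048)
    pipe = pipeline("OpenGVLab/InternVL2-8B-AWQ", backend_config=engine)  # illustrative model ID

    image = Image.open("example.jpg").convert("RGB")
    response = pipe(("Describe this image.", image))  # blocking, non-streaming call
    print(response.text)

Note that CUDA_LAUNCH_BLOCKING is a standard CUDA/PyTorch switch that serializes kernel launches, trading speed for synchronous, deterministic error reporting.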
@@ -196,46 +208,112 @@ def analyze_image(image, prompt):
         else:
             # If somehow it's already a PIL Image
             image_pil = image.convert('RGB')
 
-        import queue
-        result_queue = queue.Queue()
-        def run_inference_in_thread():
+
+        # We'll use a completely different approach - multiprocessing
+        # This runs the model in a separate process, avoiding any event loop conflicts
+        import multiprocessing as mp
 
+        # Define a function to run in a separate process
+        def run_in_process(prompt, image_path, result_queue):
             try:
+                # Set environment variables in the subprocess
+                import os
+                os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+                os.environ["LMDEPLOY_DISABLE_ASYNC"] = "1"
+
+                # Import libraries inside the process
+                from lmdeploy import pipeline, TurbomindEngineConfig
+
+                # Save the image to a temporary file to pass between processes
+                import tempfile
+                import torch
+
+                # Check GPU in subprocess
+                print(f"Subprocess GPU available: {torch.cuda.is_available()}")
+
+                # Configure for AWQ quantized model
+                backend_config = TurbomindEngineConfig(
+                    model_format='awq',
+                    session_len=2048
+                )
+
+                # Create new pipeline in the subprocess
+                model_pipeline = pipeline(
+                    MODEL_ID,
+                    backend_config=backend_config,
+                    log_level='INFO',
+                    model_name_or_path=None,
+                    backend_name="turbomind",
+                    stream=False,
+                    tensor_parallel=1,
+                )
+
+                # Load the image in the subprocess
+                from PIL import Image
+                image = Image.open(image_path).convert('RGB')
+
+                # Run inference
+                response = model_pipeline((prompt, image))
+                result = response.text if hasattr(response, "text") else str(response)
+
+                # Put the result in the queue
+                result_queue.put(("success", result))
+
             except Exception as e:
+                import traceback
+                error_msg = f"Error in subprocess: {str(e)}\n{traceback.format_exc()}"
+                result_queue.put(("error", error_msg))
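The worker reports back through a (status, payload) tuple on the queue, so exceptions never have to cross the process boundary; the pipeline is also rebuilt inside the child, since an initialized CUDA context and the TurboMind engine cannot safely be shared across processes. A minimal consumer for that tuple protocol might look like this (handle_result is a hypothetical helper, not part of the app):

    def handle_result(result_queue):
        # Drain one (status, payload) tuple produced by run_in_process
        if result_queue.empty():
            return "no result produced"
        status, payload = result_queue.get(block=False)
        return payload if status == "success" else f"failed: {payload}"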
 
+        # Create a temporary file for the image
+        import tempfile
+        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
+            temp_path = temp_file.name
+            image_pil.save(temp_path)
 
+        try:
+            # Create a process-safe queue
+            result_queue = mp.Queue()
+
+            # Start the process
+            print("Starting model inference in a separate process")
+            process = mp.Process(
+                target=run_in_process,
+                args=(prompt, temp_path, result_queue)
+            )
+
+            # Make it a daemon so it terminates when the main process ends
+            process.daemon = True
+            process.start()
+
+            # Wait for the process to complete (with timeout)
+            process.join(timeout=180)  # 3 minute timeout
+
+            # Delete the temporary file
+            try:
+                os.unlink(temp_path)
+            except:
+                pass
+
+            if process.is_alive():
+                # Terminate the process if it's still running after timeout
+                process.terminate()
+                return "Model inference timed out after 180 seconds. The model might be too slow on this hardware."
+
+            # Get the result from the queue (non-blocking to avoid hanging)
+            if not result_queue.empty():
+                status, result = result_queue.get(block=False)
+                if status == "error":
+                    return f"Error in model inference: {result}"
+                else:
+                    elapsed_time = time.time() - start_time
+                    return result
             else:
+                return "Unknown error: Model inference process completed but did not produce a result"
+
+        except Exception as e:
+            print(f"Error in multiprocessing: {str(e)}")
+            return f"Error setting up multiprocessing: {str(e)}"
+
     except Exception as e:
         print(f"Error in image analysis: {str(e)}")
         # Try to clean up memory in case of error
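Stripped of the model-specific parts, this hunk's orchestration is a standard run-in-a-subprocess-with-timeout pattern. A self-contained sketch using the same (status, payload) protocol; the slow_task worker and 10-second timeout are illustrative, and the spawn context is an extra precaution not present in the diff (on Linux the default fork start method can inherit an already-initialized CUDA context from the parent):

    import multiprocessing as mp

    def slow_task(result_queue):
        # Stand-in for model inference; reports via the (status, payload) protocol
        try:
            result_queue.put(("success", "done"))
        except Exception as e:
            result_queue.put(("error", str(e)))

    if __name__ == "__main__":
        ctx = mp.get_context("spawn")  # clean interpreter/CUDA state in the child
        result_queue = ctx.Queue()
        process = ctx.Process(target=slow_task, args=(result_queue,))
        process.daemon = True
        process.start()
        process.join(timeout=10)

        if process.is_alive():
            process.terminate()
            print("timed out")
        elif not result_queue.empty():
            status, payload = result_queue.get(block=False)
            print(status, payload)
        else:
            print("process exited without reporting a result")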
|