vikas-hugging-space committed
Commit fe5ba63 · verified · 1 Parent(s): e938a8e

Create app.py

Files changed (1):
  app.py (+753, -0)
app.py (added, 753 lines):
"""
BLIND ASSISTANCE MODEL - HUGGING FACE SPACES DEPLOYMENT
Enhanced Video Navigation System with Audio Guidance
"""

import gradio as gr
import cv2
import numpy as np
from ultralytics import YOLO
from gtts import gTTS
import pygame
import os
import time
from collections import deque
from PIL import Image, ImageEnhance
import torch
import threading
from moviepy.editor import VideoFileClip, AudioFileClip, CompositeAudioClip
import tempfile
import json

# Optional imports
try:
    import easyocr
    EASYOCR_AVAILABLE = True
except ImportError:
    EASYOCR_AVAILABLE = False
    print("⚠️ EasyOCR not available")

try:
    import segmentation_models_pytorch as smp
    SMP_AVAILABLE = True
except ImportError:
    SMP_AVAILABLE = False
    print("⚠️ segmentation_models_pytorch not available")
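
# NOTE: the imports above assume the Space ships a requirements.txt along the lines of
# gradio, opencv-python-headless, numpy, ultralytics, gtts, pygame, pillow, torch,
# moviepy, easyocr and segmentation-models-pytorch. Exact package pins are not part of
# this commit, so treat this list as an assumption rather than the actual dependency file.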


class AudioNavigationSystem:
    def __init__(self):
        print("🚀 Initializing Blind Assistance Model...")

        # Load YOLOv8 model
        print("Loading YOLOv8 model...")
        self.model = YOLO('yolov8n.pt')
        print("✅ Model loaded successfully!")

        # Initialize Semantic Segmentation Model
        print("Loading Semantic Segmentation Model...")
        self.segmentation_model = self.load_segmentation_model()
        print("✅ Segmentation model loaded!")

        # Define segmentation classes
        self.segmentation_classes = {
            0: 'road', 1: 'sidewalk', 2: 'building', 3: 'wall', 4: 'fence',
            5: 'pole', 6: 'traffic light', 7: 'traffic sign', 8: 'vegetation',
            9: 'terrain', 10: 'sky', 11: 'person', 12: 'rider', 13: 'car',
            14: 'truck', 15: 'bus', 16: 'train', 17: 'motorcycle', 18: 'bicycle',
            19: 'void'
        }

        # Initialize Text Detection
        print("Loading Text Detection...")
        self.reader = self.load_text_detector()
        print("✅ Text detection initialized!")

        # Audio system
        self.use_audio = True
        self.audio_files = []
        self.audio_timestamps = []
        self.video_start_time = None
        self.speaking = False
        self.audio_lock = threading.Lock()

        # Navigation classes
        self.navigation_classes = {
            'person': 'person', 'car': 'vehicle', 'truck': 'vehicle', 'bus': 'vehicle',
            'motorcycle': 'vehicle', 'bicycle': 'bicycle', 'traffic light': 'traffic light',
            'stop sign': 'stop sign', 'chair': 'chair', 'bench': 'bench'
        }

        # Priority levels
        self.object_priority = {
            'important_text': 10,
            'vehicle': 5,
            'person': 4,
            'bicycle': 4,
            'traffic light': 3,
            'stop sign': 3,
            'stairs': 4,
            'curb': 4,
            'crosswalk': 3,
            'text': 2,
            'road': 1,
            'sidewalk': 1,
            'building': 1,
            'vegetation': 1
        }

        # Important keywords for text
        self.important_keywords = [
            'exit', 'entrance', 'warning', 'danger', 'caution', 'stop',
            'stairs', 'elevator', 'escalator', 'crosswalk', 'curb',
            'emergency', 'hospital', 'police', 'fire', 'help',
            'men', 'women', 'toilet', 'restroom', 'washroom',
            'up', 'down', 'left', 'right', 'north', 'south', 'east', 'west',
            'hazard', 'attention'
        ]

        # Frame dimensions
        self.frame_width = 0
        self.frame_height = 0

        # Announcement cooldown
        self.last_announcement = time.time()
        self.announcement_cooldown = 3

        # Store detected items
        self.detected_items = set()
        self.text_size_reference = 100
        self.last_segmentation_analysis = ""
        self.segmentation_cooldown = 2

        print("✅ System initialized successfully!")

    def load_text_detector(self):
        """Load text detection model"""
        if EASYOCR_AVAILABLE:
            try:
                return easyocr.Reader(['en'])
            except Exception as e:
                print(f"⚠️ EasyOCR initialization failed: {e}")
        return None

    def load_segmentation_model(self):
        """Load segmentation model"""
        if not SMP_AVAILABLE:
            return None
        try:
            model = smp.Unet(
                encoder_name="mobilenet_v2",
                encoder_weights="voc",
                classes=20,
                activation=None,
            )
            return model
        except Exception as e:
            print(f"⚠️ Could not load segmentation model: {e}")
            return None

    def perform_semantic_segmentation(self, frame):
        """Perform semantic segmentation"""
        try:
            h, w = frame.shape[:2]
            seg_map = np.zeros((h, w), dtype=np.uint8)
            hsv = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)

            # Road detection
            dark_mask = cv2.inRange(hsv, (0, 0, 0), (180, 255, 100))
            seg_map[h//2:, :][dark_mask[h//2:, :] > 0] = 0

            # Sky detection
            sky_mask = cv2.inRange(hsv, (100, 50, 150), (140, 255, 255))
            seg_map[:h//3, :][sky_mask[:h//3, :] > 0] = 10

            return seg_map
        except Exception as e:
            return np.zeros((frame.shape[0], frame.shape[1]), dtype=np.uint8)

    def analyze_segmentation_map(self, seg_map):
        """Analyze segmentation map"""
        h, w = seg_map.shape
        analysis = {
            'immediate_walkable': 0,
            'immediate_obstacles': 0,
            'critical_warnings': [],
            'guidance': [],
            'environment': 'unknown'
        }

        immediate_path = seg_map[int(h*0.7):, :]
        road_pixels = np.sum(immediate_path == 0)
        total_pixels = immediate_path.size

        if total_pixels > 0:
            road_percentage = (road_pixels / total_pixels) * 100
            if road_percentage > 60:
                analysis['guidance'].append("Clear path ahead")
                analysis['environment'] = 'road'
            elif road_percentage > 30:
                analysis['guidance'].append("Moderate path clarity")
                analysis['environment'] = 'mixed'
            else:
                analysis['guidance'].append("Obstructed path ahead")
                analysis['environment'] = 'obstructed'

        return analysis

    def generate_segmentation_guidance(self, seg_analysis):
        """Generate guidance from segmentation"""
        if not seg_analysis['guidance']:
            return None

        guidance = ". ".join(seg_analysis['guidance'])
        if seg_analysis['environment'] == 'road':
            guidance += ". You appear to be on a road."
        elif seg_analysis['environment'] == 'obstructed':
            guidance += ". Path may be obstructed."

        return guidance

    def preprocess_image_for_text(self, image):
        """Preprocess image for text detection"""
        pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        enhancer = ImageEnhance.Contrast(pil_image)
        pil_image = enhancer.enhance(2.0)
        enhancer = ImageEnhance.Sharpness(pil_image)
        pil_image = enhancer.enhance(2.0)
        return cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)

    def detect_text_easyocr(self, frame):
        """Detect text using EasyOCR"""
        if self.reader is None:
            return []

        try:
            processed_frame = self.preprocess_image_for_text(frame)
            gray = cv2.cvtColor(processed_frame, cv2.COLOR_BGR2GRAY)
            blurred = cv2.GaussianBlur(gray, (5, 5), 0)
            thresh = cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                           cv2.THRESH_BINARY, 11, 2)
            kernel = np.ones((2, 2), np.uint8)
            morphed = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
            processed_for_ocr = cv2.cvtColor(morphed, cv2.COLOR_GRAY2BGR)

            results = self.reader.readtext(processed_for_ocr,
                                           decoder='beamsearch',
                                           beamWidth=5,
                                           batch_size=1,
                                           height_ths=0.5,
                                           width_ths=0.5,
                                           min_size=20,
                                           text_threshold=0.3,
                                           link_threshold=0.3)

            detected_texts = []
            for (bbox, text, confidence) in results:
                if confidence > 0.4 and len(text.strip()) > 1:
                    clean_text = text.strip().lower()

                    if len(bbox) >= 4:
                        y_coords = [point[1] for point in bbox]
                        text_height = max(y_coords) - min(y_coords)
                        distance = self.calculate_text_distance(text_height)
                        distance_category = self.get_distance_category(distance)
                        is_important = any(keyword in clean_text for keyword in self.important_keywords)

                        detected_texts.append({
                            'type': 'text',
                            'text': clean_text,
                            'confidence': confidence,
                            'bbox': bbox,
                            'position': self.get_text_position(bbox),
                            'distance': distance,
                            'distance_category': distance_category,
                            'is_important': is_important,
                            'priority': 10 if is_important else 2
                        })

            return detected_texts
        except Exception as e:
            print(f"Text detection error: {e}")
            return []

    def get_text_position(self, bbox):
        """Determine text position"""
        if isinstance(bbox, list) and len(bbox) == 4:
            x_coords = [point[0] for point in bbox]
            x_center = sum(x_coords) / len(x_coords)
            third = self.frame_width / 3

            if x_center < third:
                return "left"
            elif x_center < 2 * third:
                return "center"
            else:
                return "right"
        return "center"

    def calculate_text_distance(self, bbox_height):
        """Estimate text distance"""
        if bbox_height <= 0:
            return 10.0
        distance = (self.text_size_reference * 2.0) / bbox_height
        return max(0.5, min(distance, 15.0))

    def get_distance_category(self, distance):
        """Convert distance to category"""
        if distance < 2:
            return "very close"
        elif distance < 4:
            return "close"
        elif distance < 7:
            return "moderate distance"
        elif distance < 10:
            return "far"
        else:
            return "very far"

    def calculate_object_distance(self, bbox_height, object_type="person"):
        """Estimate object distance"""
        reference_sizes = {
            'person': 1.7, 'vehicle': 1.5, 'bicycle': 1.0,
            'animal': 0.5, 'chair': 1.0, 'bench': 1.0,
            'pole': 2.0, 'default': 1.0
        }
        real_height = reference_sizes.get(object_type, reference_sizes['default'])
        focal_length = 500

        if bbox_height > 0:
            distance = (focal_length * real_height) / bbox_height
            return max(0.5, min(distance, 20))
        return 20

    def get_object_position(self, bbox):
        """Determine object position"""
        x_center = (bbox[0] + bbox[2]) / 2
        third = self.frame_width / 3

        if x_center < third:
            return "left"
        elif x_center < 2 * third:
            return "center"
        else:
            return "right"

    def get_comprehensive_priority(self, item):
        """Calculate comprehensive priority"""
        base_priority = self.object_priority.get(item.get('label', 'object'), 1)
        distance = item.get('distance', 10)
        distance_factor = max(0, 10 - distance) / 2
        position = item.get('position', 'right')
        position_factor = 2 if position == 'center' else 1

        if item.get('type') == 'text':
            if item.get('is_important', False):
                return 10 + distance_factor
            else:
                return 5 + distance_factor

        return base_priority * position_factor + distance_factor
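
    # Worked example (illustrative numbers, not from the commit): with the pinhole-style
    # estimate in calculate_object_distance, a person (assumed real height 1.7 m) whose
    # bounding box is 170 px tall at the assumed focal length of 500 px gives
    #     distance = (500 * 1.7) / 170 = 5.0 m  ->  "moderate distance".
    # For get_comprehensive_priority, a vehicle (base priority 5) in the center at 2 m
    # scores 5 * 2 + (10 - 2) / 2 = 14, so it is announced ahead of a distant, non-important sign.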

    def generate_comprehensive_announcement(self, all_detections):
        """Generate balanced announcements"""
        if not all_detections:
            return "Path clear"

        messages = []
        all_detections.sort(key=self.get_comprehensive_priority, reverse=True)

        announced_count = 0
        max_announcements = 4

        for item in all_detections:
            if announced_count >= max_announcements:
                break

            item_type = item.get('type', 'object')

            if item_type == 'text':
                text = item['text']
                position = item['position']
                distance_category = item['distance_category']

                if item['is_important']:
                    messages.append(f"IMPORTANT: {text} {distance_category} on your {position}")
                else:
                    messages.append(f"Sign: {text} {distance_category} on your {position}")

                announced_count += 1
            else:
                if announced_count < max_announcements:
                    label = item['label']
                    position = item['position']
                    distance_category = item['distance_category']

                    if position == "center" and item['distance'] < 3:
                        messages.append(f"Warning! {label} directly ahead, {distance_category}")
                    else:
                        messages.append(f"{label} on your {position}, {distance_category}")

                    announced_count += 1

        center_objects = [item for item in all_detections
                          if item.get('position') == 'center' and item.get('distance', 10) < 3]

        if center_objects and len(messages) < 5:
            left_count = sum(1 for item in all_detections[:6] if item.get('position') == 'left')
            right_count = sum(1 for item in all_detections[:6] if item.get('position') == 'right')

            if left_count < right_count:
                messages.append("Consider moving left")
            elif right_count < left_count:
                messages.append("Consider moving right")

        return ". ".join(messages)
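
    # Illustrative output of generate_comprehensive_announcement (the detections are
    # hypothetical; the wording follows the f-strings above):
    #   "Warning! vehicle directly ahead, very close. person on your left, close.
    #    Consider moving right"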

    def speak_gtts(self, text, timestamp=None):
        """Text-to-speech using gTTS"""
        if not text or self.speaking:
            return

        with self.audio_lock:
            self.speaking = True
            try:
                if timestamp is None:
                    if self.video_start_time:
                        timestamp = time.time() - self.video_start_time
                    else:
                        timestamp = 0

                minutes = int(timestamp // 60)
                seconds = int(timestamp % 60)
                timestamp_str = f"{minutes:02d}:{seconds:02d}"

                print(f"🔊 [{timestamp_str}] GUIDANCE: {text}")

                tts = gTTS(text=text, lang='en', slow=False)
                audio_filename = f"audio_{timestamp_str.replace(':', '-')}_{int(time.time() * 1000)}.mp3"
                tts.save(audio_filename)

                self.audio_files.append(audio_filename)
                self.audio_timestamps.append({
                    'filename': audio_filename,
                    'timestamp': timestamp,
                    'timestamp_str': timestamp_str,
                    'text': text
                })

            except Exception as e:
                print(f"⚠️ Speech generation error: {e}")
            finally:
                self.speaking = False
                time.sleep(0.5)
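
    # Note: gTTS synthesizes speech through Google's online TTS endpoint, so the Space
    # needs outbound network access. Nothing is played back here; each utterance is saved
    # as an MP3 and recorded in self.audio_timestamps so merge_audio_into_video() can
    # overlay it onto the processed video at the matching timestamp.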

    def process_frame(self, frame):
        """Process video frame"""
        self.frame_height, self.frame_width = frame.shape[:2]

        seg_map = self.perform_semantic_segmentation(frame)
        seg_analysis = self.analyze_segmentation_map(seg_map)

        results = self.model(frame, conf=0.4, verbose=False)

        all_detections = []
        objects_info = []
        text_info = []

        # Process YOLO detections
        for result in results:
            boxes = result.boxes
            for box in boxes:
                x1, y1, x2, y2 = map(int, box.xyxy[0])
                conf = float(box.conf[0])
                cls = int(box.cls[0])
                label = self.model.names[cls]

                if label.lower() in self.navigation_classes:
                    nav_label = self.navigation_classes[label.lower()]
                    bbox_height = y2 - y1
                    distance = self.calculate_object_distance(bbox_height, nav_label)
                    distance_category = self.get_distance_category(distance)
                    position = self.get_object_position([x1, y1, x2, y2])

                    object_info = {
                        'type': 'object',
                        'label': nav_label,
                        'distance': distance,
                        'distance_category': distance_category,
                        'position': position,
                        'bbox': [x1, y1, x2, y2],
                        'confidence': conf,
                        'priority': self.object_priority.get(nav_label, 1)
                    }

                    objects_info.append(object_info)
                    all_detections.append(object_info)

                    # Draw bounding box
                    if nav_label == 'vehicle':
                        color = (0, 0, 255)
                    elif nav_label == 'person':
                        color = (0, 255, 255)
                    elif nav_label == 'bicycle':
                        color = (255, 0, 0)
                    else:
                        color = (0, 255, 0)

                    cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
                    label_text = f"{nav_label.upper()} {distance_category}"
                    (tw, th), _ = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)
                    cv2.rectangle(frame, (x1, y1-th-10), (x1+tw+10, y1), color, -1)
                    cv2.putText(frame, label_text, (x1+5, y1-5),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)

        # Detect text
        current_time = time.time()
        if (current_time - self.last_announcement) > 1.5:
            text_info = self.detect_text_easyocr(frame)

            new_texts = []
            for text_data in text_info:
                text_hash = hash(text_data['text'][:20])
                if text_hash not in self.detected_items:
                    new_texts.append(text_data)
                    self.detected_items.add(text_hash)

            text_info = new_texts
            all_detections.extend(text_info)

        # Draw text bounding boxes
        for text_data in text_info:
            bbox = text_data['bbox']
            text = text_data['text']
            is_important = text_data['is_important']

            color = (255, 0, 255) if is_important else (255, 255, 0)
            thickness = 3 if is_important else 2

            pts = np.array(bbox, np.int32)
            pts = pts.reshape((-1, 1, 2))
            cv2.polylines(frame, [pts], True, color, thickness)

            label_text = f"🚩 {text}" if is_important else f"TEXT: {text}"
            x_coords = [point[0] for point in bbox]
            y_coords = [point[1] for point in bbox]
            text_x = int(min(x_coords))
            text_y = int(min(y_coords)) - 10

            if text_y < 20:
                text_y = int(max(y_coords)) + 25

            (tw, th), _ = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)
            cv2.rectangle(frame, (text_x, text_y-th-5), (text_x+tw+10, text_y+5), color, -1)
            cv2.putText(frame, label_text, (text_x+5, text_y),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)

        # Generate navigation message
        message = None
        if (current_time - self.last_announcement) > self.announcement_cooldown:
            seg_guidance = self.generate_segmentation_guidance(seg_analysis)
            object_message = self.generate_comprehensive_announcement(all_detections)

            if seg_guidance and "obstructed" in seg_guidance.lower():
                message = f"{seg_guidance}. {object_message}"
            elif seg_guidance and object_message == "Path clear":
                message = seg_guidance
            else:
                message = object_message

            if message and message != "Path clear":
                threading.Thread(target=self.speak_gtts, args=(message,)).start()
                self.last_announcement = current_time

        # Status overlay
        overlay = frame.copy()
        cv2.rectangle(overlay, (5, 5), (500, 35), (0, 0, 0), -1)
        cv2.addWeighted(overlay, 0.6, frame, 0.4, 0, frame)

        status_text = f"Objects: {len(objects_info)} | Texts: {len(text_info)}"
        cv2.putText(frame, status_text, (15, 28),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

        # Draw center danger zone
        center_objects = [obj for obj in objects_info if obj['position'] == 'center' and obj['distance'] < 3]
        if center_objects:
            cv2.rectangle(frame, (self.frame_width//3, self.frame_height-100),
                          (2*self.frame_width//3, self.frame_height-10), (0, 0, 255), 3)
            cv2.putText(frame, "OBSTACLE IN PATH", (self.frame_width//3 + 20, self.frame_height-50),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2)

        return frame, message, len(objects_info), len(text_info)

    def process_video(self, video_path, output_path='output_navigation.mp4'):
        """Process uploaded video"""
        cap = cv2.VideoCapture(video_path)

        # Fall back to 30 FPS if the container reports no frame rate
        fps = int(cap.get(cv2.CAP_PROP_FPS)) or 30
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        print(f"Processing video: {total_frames} frames at {fps} FPS")

        self.audio_timestamps = []
        self.audio_files = []
        self.detected_items = set()
        self.video_start_time = time.time()
        frame_count = 0

        try:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                processed_frame, message, obj_count, text_count = self.process_frame(frame)
                out.write(processed_frame)
                frame_count += 1

                if frame_count % 30 == 0:
                    progress = (frame_count / total_frames) * 100
                    print(f"Progress: {progress:.1f}%")

        finally:
            cap.release()
            out.release()
            print("✅ Video processing complete!")

        if self.audio_timestamps:
            final_output = 'final_with_audio.mp4'
            return self.merge_audio_into_video(output_path, final_output)
        else:
            return output_path
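
    # Note: the intermediate file above is encoded with the 'mp4v' FourCC, which some
    # browsers will not play inline; when audio guidance was generated, the merge step
    # below re-encodes with libx264/AAC, and that file is what reaches the Gradio player.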

    def merge_audio_into_video(self, video_path, output_path='final_with_audio.mp4'):
        """Merge audio into video"""
        print("🎵 Merging audio into video...")

        if not self.audio_timestamps:
            return video_path

        try:
            video = VideoFileClip(video_path)
            video_duration = video.duration

            audio_clips = []
            for audio_info in self.audio_timestamps:
                if os.path.exists(audio_info['filename']):
                    try:
                        audio_clip = AudioFileClip(audio_info['filename'])
                        audio_clip = audio_clip.set_start(audio_info['timestamp'])
                        audio_clips.append(audio_clip)
                    except Exception as e:
                        print(f"⚠️ Failed to load {audio_info['filename']}: {e}")

            if not audio_clips:
                return video_path

            final_audio = CompositeAudioClip(audio_clips)
            final_audio = final_audio.set_duration(video_duration)
            final_video = video.set_audio(final_audio)

            final_video.write_videofile(
                output_path,
                codec='libx264',
                audio_codec='aac',
                fps=video.fps,
                verbose=False,
                logger=None
            )

            video.close()
            final_video.close()
            final_audio.close()
            for clip in audio_clips:
                clip.close()

            print("✅ Video with audio saved!")
            return output_path

        except Exception as e:
            print(f"❌ Error merging audio: {e}")
            return video_path


# Initialize the system
nav_system = AudioNavigationSystem()


def process_video_gradio(video_file):
    """Gradio interface function"""
    try:
        if video_file is None:
            return None, "Please upload a video file"

        # gr.Video typically hands this function a filesystem path (str); some setups
        # pass raw bytes instead, so handle both before processing.
        if isinstance(video_file, (str, os.PathLike)):
            input_path = str(video_file)
        else:
            with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmp_input:
                tmp_input.write(video_file)
                input_path = tmp_input.name

        # Check video duration
        cap = cv2.VideoCapture(input_path)
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
        duration = frame_count / fps if fps > 0 else 0
        cap.release()

        if duration > 15:
            return None, f"⚠️ Video is {duration:.1f} seconds long. Please upload a video shorter than 15 seconds."

        # Process video
        output_path = nav_system.process_video(input_path)

        # Generate transcript
        transcript_text = "Audio Guidance Transcript:\n\n"
        for item in nav_system.audio_timestamps:
            transcript_text += f"[{item['timestamp_str']}] {item['text']}\n\n"

        return output_path, transcript_text

    except Exception as e:
        return None, f"Error processing video: {str(e)}"


# Create Gradio interface
with gr.Blocks(title="Blind Assistance AI", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🦯 Blind Assistance AI - Video Navigation System

    Upload a video to receive audio navigation guidance with object detection, text recognition, and scene analysis.

    ⚠️ **Important:** Please upload videos **shorter than 15 seconds** for optimal processing.
    """)

    with gr.Row():
        with gr.Column():
            video_input = gr.Video(label="Upload Video (Max 15 seconds)")
            process_btn = gr.Button("Process Video", variant="primary", size="lg")

        with gr.Column():
            video_output = gr.Video(label="Processed Video with Audio Guidance")
            transcript_output = gr.Textbox(label="Audio Transcript", lines=10)

    gr.Markdown("""
    ### Features:
    - 🎯 **Object Detection**: Identifies people, vehicles, and obstacles
    - 📝 **Text Detection & OCR**: Reads signs, labels, and important text
    - 🗺️ **Scene Analysis**: Understands environment and context
    - 🔊 **Voice Guidance**: Real-time audio navigation instructions
    """)

    process_btn.click(
        fn=process_video_gradio,
        inputs=[video_input],
        outputs=[video_output, transcript_output]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()
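
# On Hugging Face Spaces, demo.launch() with no arguments is sufficient; the platform
# supplies the host and port. For local debugging you could instead call, for example,
# demo.launch(share=True) to get a temporary public URL (a standard Gradio option,
# not part of this commit).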