hamza2923 committed on
Commit
df8c9a8
·
verified ·
1 Parent(s): 9ee640f

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +78 -58
main.py CHANGED
@@ -2,7 +2,7 @@ from fastapi import FastAPI, HTTPException
2
  from fastapi.middleware.cors import CORSMiddleware
3
  from pydantic import BaseModel
4
  from selenium import webdriver
5
- from selenium.webdriver.chrome.service import Service
6
  from selenium.webdriver.chrome.options import Options
7
  from selenium.webdriver.common.by import By
8
  from selenium.webdriver.support.ui import WebDriverWait
@@ -13,6 +13,9 @@ import logging
13
  import os
14
  import shutil
15
  from pathlib import Path
 
 
 
16
 
17
  app = FastAPI()
18
 
@@ -39,7 +42,16 @@ class TranscriptResponse(BaseModel):
39
  error: str | None
40
  processing_time: float
41
 
42
- # Driver init and route handlers here...
 
 
 
 
 
 
 
 
 
43
 
44
  def init_driver():
45
  options = Options()
@@ -47,36 +59,35 @@ def init_driver():
47
  options.add_argument("--no-sandbox")
48
  options.add_argument("--disable-dev-shm-usage")
49
  options.add_argument("--disable-gpu")
50
-
51
- # Try multiple possible Chrome binary locations
52
  possible_chrome_paths = [
53
  "/usr/bin/google-chrome",
54
  "/usr/bin/google-chrome-stable",
55
- "/usr/lib/chromium-browser/chrome",
56
- "/usr/bin/chromium"
57
  ]
58
  chrome_path = None
59
  for path in possible_chrome_paths:
60
  if os.path.exists(path):
61
  chrome_path = path
62
  break
63
-
64
  if not chrome_path:
65
  logger.error(f"No Chrome binary found in paths: {possible_chrome_paths}")
66
  raise Exception(f"No Chrome binary found in paths: {possible_chrome_paths}")
67
-
68
  options.binary_location = chrome_path
69
  logger.info(f"Using Chrome binary: {chrome_path}")
70
-
71
  try:
72
- chromedriver_path = "/usr/bin/chromedriver"
73
- if not os.path.exists(chromedriver_path):
74
  logger.error(f"ChromeDriver not found at {chromedriver_path}")
75
  raise Exception(f"ChromeDriver not found at {chromedriver_path}")
76
-
77
  service = Service(executable_path=chromedriver_path)
78
  driver = webdriver.Chrome(service=service, options=options)
79
- logger.info("ChromeDriver initialized successfully")
 
 
80
  return driver
81
  except Exception as e:
82
  logger.error(f"Driver initialization failed: {str(e)}")
@@ -86,65 +97,74 @@ def init_driver():
86
  async def get_transcript(request: VideoRequest):
87
  start_time = time.time()
88
  driver = None
89
-
90
  try:
91
  video_url = request.url
92
- if not ("youtube.com" in video_url or "youtu.be" in video_url):
93
  raise HTTPException(status_code=400, detail="Invalid YouTube URL")
94
 
95
  driver = init_driver()
96
  logger.info(f"Processing URL: {video_url}")
97
  driver.get(video_url)
98
 
99
- # Handle cookie consent if it appears
100
- try:
101
- cookie_button = WebDriverWait(driver, 5).until(
102
- EC.element_to_be_clickable((By.XPATH, "//*[contains(text(), 'Accept all')]"))
 
 
 
 
 
 
 
 
 
103
  )
104
- cookie_button.click()
105
- logger.info("Accepted cookies")
106
- except TimeoutException:
107
- logger.info("No cookie consent found")
108
- pass
109
-
110
- # Click more button
111
- more_button = WebDriverWait(driver, 10).until(
112
- EC.element_to_be_clickable((By.ID, "expand"))
113
- )
114
- driver.execute_script("arguments[0].click();", more_button)
115
-
116
- # Click transcript button
117
- transcript_button = WebDriverWait(driver, 10).until(
118
- EC.element_to_be_clickable((By.CSS_SELECTOR, "button[aria-label='Show transcript']"))
119
- )
120
- driver.execute_script("arguments[0].click();", transcript_button)
121
 
122
- # Wait for transcript
123
- WebDriverWait(driver, 15).until(
124
- EC.presence_of_element_located((By.ID, "segments-container"))
125
- )
 
126
 
127
- # Extract transcript
128
- segments = driver.find_elements(By.CSS_SELECTOR, "div.ytd-transcript-segment-renderer")
129
- transcript = []
130
- for segment in segments:
131
- try:
132
- text = segment.find_element(By.CLASS_NAME, "segment-text").text.strip()
133
- if text:
134
- transcript.append(text)
135
- except:
136
- continue
137
 
138
- if not transcript:
139
- raise HTTPException(status_code=404, detail="No transcript available")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
 
 
 
141
  return TranscriptResponse(
142
- success=True,
143
- transcript=transcript,
144
- error=None,
145
  processing_time=time.time() - start_time
146
  )
147
-
148
  except TimeoutException as e:
149
  error_msg = "Timed out waiting for page elements - the video might not have transcripts"
150
  logger.error(error_msg)
@@ -165,7 +185,7 @@ async def get_transcript(request: VideoRequest):
165
  finally:
166
  if driver:
167
  driver.quit()
168
-
169
  @app.get("/health")
170
  def health_check():
171
  chrome_path = shutil.which("google-chrome")
@@ -183,4 +203,4 @@ async def root():
183
 
184
  if __name__ == "__main__":
185
  import uvicorn
186
- uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 7860)))
 
2
  from fastapi.middleware.cors import CORSMiddleware
3
  from pydantic import BaseModel
4
  from selenium import webdriver
5
+ from selenium.webdriver.chrome.service import Service
6
  from selenium.webdriver.chrome.options import Options
7
  from selenium.webdriver.common.by import By
8
  from selenium.webdriver.support.ui import WebDriverWait
 
13
  import os
14
  import shutil
15
  from pathlib import Path
16
+ import re
17
+ from contextlib import contextmanager
18
+ import signal
19
 
20
  app = FastAPI()
21
 
 
42
  error: str | None
43
  processing_time: float
44
 
45
+ @contextmanager
46
+ def timeout(seconds):
47
+ def signal_handler(signum, frame):
48
+ raise TimeoutError("Operation timed out")
49
+ signal.signal(signal.SIGALRM, signal_handler)
50
+ signal.alarm(seconds)
51
+ try:
52
+ yield
53
+ finally:
54
+ signal.alarm(0)
55
 
56
  def init_driver():
57
  options = Options()
 
59
  options.add_argument("--no-sandbox")
60
  options.add_argument("--disable-dev-shm-usage")
61
  options.add_argument("--disable-gpu")
62
+
 
63
  possible_chrome_paths = [
64
  "/usr/bin/google-chrome",
65
  "/usr/bin/google-chrome-stable",
 
 
66
  ]
67
  chrome_path = None
68
  for path in possible_chrome_paths:
69
  if os.path.exists(path):
70
  chrome_path = path
71
  break
72
+
73
  if not chrome_path:
74
  logger.error(f"No Chrome binary found in paths: {possible_chrome_paths}")
75
  raise Exception(f"No Chrome binary found in paths: {possible_chrome_paths}")
76
+
77
  options.binary_location = chrome_path
78
  logger.info(f"Using Chrome binary: {chrome_path}")
79
+
80
  try:
81
+ chromedriver_path = shutil.which("chromedriver")
82
+ if not chromedriver_path or not os.path.exists(chromedriver_path):
83
  logger.error(f"ChromeDriver not found at {chromedriver_path}")
84
  raise Exception(f"ChromeDriver not found at {chromedriver_path}")
85
+
86
  service = Service(executable_path=chromedriver_path)
87
  driver = webdriver.Chrome(service=service, options=options)
88
+ chrome_version = driver.capabilities["browserVersion"]
89
+ chromedriver_version = driver.capabilities["chrome"]["chromedriverVersion"].split()[0]
90
+ logger.info(f"Chrome version: {chrome_version}, ChromeDriver version: {chromedriver_version}")
91
  return driver
92
  except Exception as e:
93
  logger.error(f"Driver initialization failed: {str(e)}")
 
97
  async def get_transcript(request: VideoRequest):
98
  start_time = time.time()
99
  driver = None
100
+
101
  try:
102
  video_url = request.url
103
+ if not re.match(r"(?:https?://)?(?:www\.)?(?:youtube\.com/watch\?v=|youtu\.be/)([a-zA-Z0-9_-]{11})", video_url):
104
  raise HTTPException(status_code=400, detail="Invalid YouTube URL")
105
 
106
  driver = init_driver()
107
  logger.info(f"Processing URL: {video_url}")
108
  driver.get(video_url)
109
 
110
+ with timeout(60):
111
+ try:
112
+ cookie_button = WebDriverWait(driver, 3).until(
113
+ EC.element_to_be_clickable((By.XPATH, "//*[contains(text(), 'Accept all')]"))
114
+ )
115
+ cookie_button.click()
116
+ logger.info("Accepted cookies")
117
+ except TimeoutException:
118
+ logger.info("No cookie consent found")
119
+
120
+ logger.info("Clicking 'Show more' button")
121
+ more_button = WebDriverWait(driver, 10).until(
122
+ EC.element_to_be_clickable((By.ID, "expand"))
123
  )
124
+ driver.execute_script("arguments[0].click();", more_button)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
+ logger.info("Clicking transcript button")
127
+ transcript_button = WebDriverWait(driver, 10).until(
128
+ EC.element_to_be_clickable((By.CSS_SELECTOR, "button[aria-label='Show transcript']"))
129
+ )
130
+ driver.execute_script("arguments[0].click();", transcript_button)
131
 
132
+ logger.info("Waiting for transcript segments")
133
+ WebDriverWait(driver, 15).until(
134
+ EC.presence_of_element_located((By.ID, "segments-container"))
135
+ )
 
 
 
 
 
 
136
 
137
+ logger.info("Extracting transcript")
138
+ segments = driver.find_elements(By.CSS_SELECTOR, "div.ytd-transcript-segment-renderer")
139
+ transcript = []
140
+ for segment in segments:
141
+ try:
142
+ text = segment.find_element(By.CLASS_NAME, "segment-text").text.strip()
143
+ if text:
144
+ transcript.append(text)
145
+ except:
146
+ continue
147
+
148
+ if not transcript:
149
+ raise HTTPException(status_code=404, detail="No transcript available")
150
+
151
+ logger.info(f"Extracted {len(transcript)} transcript segments")
152
+ return TranscriptResponse(
153
+ success=True,
154
+ transcript=transcript,
155
+ error=None,
156
+ processing_time=time.time() - start_time
157
+ )
158
 
159
+ except TimeoutError:
160
+ error_msg = "Request timed out"
161
+ logger.error(error_msg)
162
  return TranscriptResponse(
163
+ success=False,
164
+ transcript=None,
165
+ error=error_msg,
166
  processing_time=time.time() - start_time
167
  )
 
168
  except TimeoutException as e:
169
  error_msg = "Timed out waiting for page elements - the video might not have transcripts"
170
  logger.error(error_msg)
 
185
  finally:
186
  if driver:
187
  driver.quit()
188
+
189
  @app.get("/health")
190
  def health_check():
191
  chrome_path = shutil.which("google-chrome")
 
203
 
204
  if __name__ == "__main__":
205
  import uvicorn
206
+ uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 7860)))