hamza2923 committed on
Commit
114627d
·
verified ·
1 Parent(s): 48cf6b1

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +186 -1
main.py CHANGED
@@ -101,22 +101,25 @@ async def get_transcript(request: VideoRequest):
101
  logger.info("Accepted cookies")
102
  except TimeoutException:
103
  logger.info("No cookie consent found")
104
- pass
105
 
 
106
  more_button = WebDriverWait(driver, 10).until(
107
  EC.element_to_be_clickable((By.ID, "expand"))
108
  )
109
  driver.execute_script("arguments[0].click();", more_button)
110
 
 
111
  transcript_button = WebDriverWait(driver, 10).until(
112
  EC.element_to_be_clickable((By.CSS_SELECTOR, "button[aria-label='Show transcript']"))
113
  )
114
  driver.execute_script("arguments[0].click();", transcript_button)
115
 
 
116
  WebDriverWait(driver, 15).until(
117
  EC.presence_of_element_located((By.ID, "segments-container"))
118
  )
119
 
 
120
  segments = driver.find_elements(By.CSS_SELECTOR, "div.ytd-transcript-segment-renderer")
121
  transcript = []
122
  for segment in segments:
@@ -130,6 +133,188 @@ async def get_transcript(request: VideoRequest):
130
  if not transcript:
131
  raise HTTPException(status_code=404, detail="No transcript available")
132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  return TranscriptResponse(
134
  success=True,
135
  transcript=transcript,
 
101
  logger.info("Accepted cookies")
102
  except TimeoutException:
103
  logger.info("No cookie consent found")
 
104
 
105
+ logger.info("Clicking 'Show more' button")
106
  more_button = WebDriverWait(driver, 10).until(
107
  EC.element_to_be_clickable((By.ID, "expand"))
108
  )
109
  driver.execute_script("arguments[0].click();", more_button)
110
 
111
+ logger.info("Clicking transcript button")
112
  transcript_button = WebDriverWait(driver, 10).until(
113
  EC.element_to_be_clickable((By.CSS_SELECTOR, "button[aria-label='Show transcript']"))
114
  )
115
  driver.execute_script("arguments[0].click();", transcript_button)
116
 
117
+ logger.info("Waiting for transcript segments")
118
  WebDriverWait(driver, 15).until(
119
  EC.presence_of_element_located((By.ID, "segments-container"))
120
  )
121
 
122
+ logger.info("Extracting transcript")
123
  segments = driver.find_elements(By.CSS_SELECTOR, "div.ytd-transcript-segment-renderer")
124
  transcript = []
125
  for segment in segments:
 
133
  if not transcript:
134
  raise HTTPException(status_code=404, detail="No transcript available")
135
 
136
+ logger.info(f"Extracted {len(transcript)} transcript segments")
137
+ return TranscriptResponse(
138
+ success=True,
139
+ transcript=transcript,
140
+ error=None,
141
+ processing_time=time.time() - start_time
142
+ )
143
+
144
+ except TimeoutException as e:
145
+ error_msg = "Timed out waiting for page elements - the video might not have transcripts"
146
+ logger.error(error_msg)
147
+ return TranscriptResponse(
148
+ success=False,
149
+ transcript=None,
150
+ error=error_msg,
151
+ processing_time=time.time() - start_time
152
+ )
153
+ except Exception as e:
154
+ logger.error(f"Error: {str(e)}")
155
+ return TranscriptResponse(
156
+ success=False,
157
+ transcript=None,
158
+ error=str(e),
159
+ processing_time=time.time() - start_time
160
+ )
161
+ finally:
162
+ if driver:
163
+ driver.quit()
164
+
165
@app.get("/health")
def health_check():
    """Report where the Chrome and ChromeDriver executables resolve on PATH.

    Returns:
        A dict with the resolved paths (``None`` when ``shutil.which`` finds
        nothing) and a boolean per binary telling whether that path exists.
    """
    chrome_path = shutil.which("google-chrome")
    chromedriver_path = shutil.which("chromedriver")
    return {
        "ChromePath": chrome_path,
        "ChromeDriverPath": chromedriver_path,
        # Do not fall back to Path(""): it is the current directory, which
        # always exists and would report a missing binary as present.
        "ChromeExists": chrome_path is not None and Path(chrome_path).exists(),
        "ChromeDriverExists": (
            chromedriver_path is not None and Path(chromedriver_path).exists()
        ),
    }
175
+
176
@app.get("/")
async def root():
    # Landing endpoint: responds with a fixed welcome payload.
    welcome = {"message": "Welcome to YouTube Transcript API"}
    return welcome
179
+
180
if __name__ == "__main__":
    # uvicorn is imported only when the module is executed as a script.
    import uvicorn

    # Bind to all interfaces; the port is read from $PORT and defaults to 7860.
    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 7860)))
# This import was fused onto the end of the uvicorn.run(...) line above,
# which is a SyntaxError; it must start on its own line.
from fastapi import FastAPI, HTTPException
183
+ from fastapi.middleware.cors import CORSMiddleware
184
+ from pydantic import BaseModel
185
+ from selenium import webdriver
186
+ from selenium.webdriver.chrome.service import Service
187
+ from selenium.webdriver.chrome.options import Options
188
+ from selenium.webdriver.common.by import By
189
+ from selenium.webdriver.support.ui import WebDriverWait
190
+ from selenium.webdriver.support import expected_conditions as EC
191
+ from selenium.common.exceptions import TimeoutException
192
+ import time
193
+ import logging
194
+ import os
195
+ import shutil
196
+ from pathlib import Path
197
+
198
+ app = FastAPI()
199
+
200
+ # Configure CORS
201
+ app.add_middleware(
202
+ CORSMiddleware,
203
+ allow_origins=["*"],
204
+ allow_credentials=True,
205
+ allow_methods=["*"],
206
+ allow_headers=["*"],
207
+ )
208
+
209
+ # Configure logging
210
+ logging.basicConfig(level=logging.INFO)
211
+ logger = logging.getLogger(__name__)
212
+
213
+ # Pydantic models
214
# Request body accepted by the POST /transcript endpoint.
class VideoRequest(BaseModel):
    # URL of the video whose transcript is requested.
    url: str
216
+
217
# Response payload returned by the /transcript endpoint.
class TranscriptResponse(BaseModel):
    # True when transcript segments were extracted, False on any failure.
    success: bool
    # Extracted segment texts; None when the request failed.
    transcript: list[str] | None
    # Error message for a failed request; None on success.
    error: str | None
    # Seconds elapsed between the start of the request and the response.
    processing_time: float
222
+
223
def init_driver():
    """Create a headless Chrome WebDriver using the system Chrome/ChromeDriver.

    Chrome is looked up at a fixed list of paths; ChromeDriver is resolved
    through PATH with ``shutil.which``.

    Returns:
        A started ``webdriver.Chrome`` instance. The caller is responsible
        for calling ``quit()`` on it.

    Raises:
        Exception: if no Chrome binary is found, or if ChromeDriver is
            missing or the browser session cannot be started (message
            prefixed with "Driver initialization failed:").
    """
    options = Options()
    for flag in (
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
    ):
        options.add_argument(flag)

    possible_chrome_paths = [
        "/usr/bin/google-chrome",
        "/usr/bin/google-chrome-stable",
    ]
    # First candidate that exists on disk, or None when none of them do.
    chrome_path = next(
        (path for path in possible_chrome_paths if os.path.exists(path)), None
    )

    if not chrome_path:
        logger.error(f"No Chrome binary found in paths: {possible_chrome_paths}")
        raise Exception(f"No Chrome binary found in paths: {possible_chrome_paths}")

    options.binary_location = chrome_path
    logger.info(f"Using Chrome binary: {chrome_path}")

    driver = None
    try:
        chromedriver_path = shutil.which("chromedriver")
        if not chromedriver_path or not os.path.exists(chromedriver_path):
            logger.error(f"ChromeDriver not found at {chromedriver_path}")
            raise Exception(f"ChromeDriver not found at {chromedriver_path}")

        service = Service(executable_path=chromedriver_path)
        driver = webdriver.Chrome(service=service, options=options)
        chrome_version = driver.capabilities["browserVersion"]
        chromedriver_version = driver.capabilities["chrome"]["chromedriverVersion"].split()[0]
        logger.info(f"Chrome version: {chrome_version}, ChromeDriver version: {chromedriver_version}")
        return driver
    except Exception as e:
        # If the browser started but a later step failed (e.g. a missing
        # capability key), shut it down; the caller never receives the
        # driver and so could not quit it itself.
        if driver is not None:
            try:
                driver.quit()
            except Exception as quit_error:
                logger.warning(f"Failed to quit driver after init error: {quit_error}")
        logger.error(f"Driver initialization failed: {str(e)}")
        # Chain the original exception so its traceback is preserved.
        raise Exception(f"Driver initialization failed: {str(e)}") from e
262
+
263
+ @app.post("/transcript", response_model=TranscriptResponse)
264
+ async def get_transcript(request: VideoRequest):
265
+ start_time = time.time()
266
+ driver = None
267
+
268
+ try:
269
+ video_url = request.url
270
+ if not ("youtube.com" in video_url or "youtu.be" in video_url):
271
+ raise HTTPException(status_code=400, detail="Invalid YouTube URL")
272
+
273
+ driver = init_driver()
274
+ logger.info(f"Processing URL: {video_url}")
275
+ driver.get(video_url)
276
+
277
+ try:
278
+ cookie_button = WebDriverWait(driver, 5).until(
279
+ EC.element_to_be_clickable((By.XPATH, "//*[contains(text(), 'Accept all')]"))
280
+ )
281
+ cookie_button.click()
282
+ logger.info("Accepted cookies")
283
+ except TimeoutException:
284
+ logger.info("No cookie consent found")
285
+
286
+ logger.info("Clicking 'Show more' button")
287
+ more_button = WebDriverWait(driver, 10).until(
288
+ EC.element_to_be_clickable((By.ID, "expand"))
289
+ )
290
+ driver.execute_script("arguments[0].click();", more_button)
291
+
292
+ logger.info("Clicking transcript button")
293
+ transcript_button = WebDriverWait(driver, 10).until(
294
+ EC.element_to_be_clickable((By.CSS_SELECTOR, "button[aria-label='Show transcript']"))
295
+ )
296
+ driver.execute_script("arguments[0].click();", transcript_button)
297
+
298
+ logger.info("Waiting for transcript segments")
299
+ WebDriverWait(driver, 15).until(
300
+ EC.presence_of_element_located((By.ID, "segments-container"))
301
+ )
302
+
303
+ logger.info("Extracting transcript")
304
+ segments = driver.find_elements(By.CSS_SELECTOR, "div.ytd-transcript-segment-renderer")
305
+ transcript = []
306
+ for segment in segments:
307
+ try:
308
+ text = segment.find_element(By.CLASS_NAME, "segment-text").text.strip()
309
+ if text:
310
+ transcript.append(text)
311
+ except:
312
+ continue
313
+
314
+ if not transcript:
315
+ raise HTTPException(status_code=404, detail="No transcript available")
316
+
317
+ logger.info(f"Extracted {len(transcript)} transcript segments")
318
  return TranscriptResponse(
319
  success=True,
320
  transcript=transcript,