hamza2923 committed on
Commit
0d58c5b
·
verified ·
1 Parent(s): aeece2a

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +38 -66
main.py CHANGED
@@ -2,7 +2,7 @@ from fastapi import FastAPI, HTTPException
2
  from fastapi.middleware.cors import CORSMiddleware
3
  from pydantic import BaseModel
4
  from selenium import webdriver
5
- from selenium.webdriver.chrome.service import Oceans
6
  from selenium.webdriver.chrome.options import Options
7
  from selenium.webdriver.common.by import By
8
  from selenium.webdriver.support.ui import WebDriverWait
@@ -13,9 +13,6 @@ import logging
13
  import os
14
  import shutil
15
  from pathlib import Path
16
- import re
17
- from contextlib import contextmanager
18
- import signal
19
 
20
  app = FastAPI()
21
 
@@ -42,17 +39,6 @@ class TranscriptResponse(BaseModel):
42
  error: str | None
43
  processing_time: float
44
 
45
- @contextmanager
46
- def timeout(seconds):
47
- def signal_handler(signum, frame):
48
- raise TimeoutError("Operation timed out")
49
- signal.signal(signal.SIGALRM, signal_handler)
50
- signal.alarm(seconds)
51
- try:
52
- yield
53
- finally:
54
- signal.alarm(0)
55
-
56
  def init_driver():
57
  options = Options()
58
  options.add_argument("--headless=new")
@@ -100,71 +86,57 @@ async def get_transcript(request: VideoRequest):
100
 
101
  try:
102
  video_url = request.url
103
- if not re.match(r"(?:https?://)?(?:www\.)?(?:youtube\.com/watch\?v=|youtu\.be/)([a-zA-Z0-9_-]{11})", video_url):
104
  raise HTTPException(status_code=400, detail="Invalid YouTube URL")
105
 
106
  driver = init_driver()
107
  logger.info(f"Processing URL: {video_url}")
108
  driver.get(video_url)
109
 
110
- with timeout(60):
111
- try:
112
- cookie_button = WebDriverWait(driver, 3).until(
113
- EC.element_to_be_clickable((By.XPATH, "//*[contains(text(), 'Accept all')]"))
114
- )
115
- cookie_button.click()
116
- logger.info("Accepted cookies")
117
- except TimeoutException:
118
- logger.info("No cookie consent found")
119
-
120
- logger.info("Clicking 'Show more' button")
121
- more_button = WebDriverWait(driver, 10).until(
122
- EC.element_to_be_clickable((By.ID, "expand"))
123
  )
124
- driver.execute_script("arguments[0].click();", more_button)
 
 
 
 
 
 
 
 
 
125
 
126
- logger.info("Clicking transcript button")
127
- transcript_button = WebDriverWait(driver, 10).until(
128
- EC.element_to_be_clickable((By.CSS_SELECTOR, "button[aria-label='Show transcript']"))
129
- )
130
- driver.execute_script("arguments[0].click();", transcript_button)
131
 
132
- logger.info("Waiting for transcript segments")
133
- WebDriverWait(driver, 15).until(
134
- EC.presence_of_element_located((By.ID, "segments-container"))
135
- )
136
 
137
- logger.info("Extracting transcript")
138
- segments = driver.find_elements(By.CSS_SELECTOR, "div.ytd-transcript-segment-renderer")
139
- transcript = []
140
- for segment in segments:
141
- try:
142
- text = segment.find_element(By.CLASS_NAME, "segment-text").text.strip()
143
- if text:
144
- transcript.append(text)
145
- except:
146
- continue
147
-
148
- if not transcript:
149
- raise HTTPException(status_code=404, detail="No transcript available")
150
-
151
- logger.info(f"Extracted {len(transcript)} transcript segments")
152
- return TranscriptResponse(
153
- success=True,
154
- transcript=transcript,
155
- error=None,
156
- processing_time=time.time() - start_time
157
- )
158
 
159
- except TimeoutError:
160
- error_msg = "Request timed out"
161
- logger.error(error_msg)
162
  return TranscriptResponse(
163
- success=False,
164
- transcript=None,
165
- error=error_msg,
166
  processing_time=time.time() - start_time
167
  )
 
168
  except TimeoutException as e:
169
  error_msg = "Timed out waiting for page elements - the video might not have transcripts"
170
  logger.error(error_msg)
 
2
  from fastapi.middleware.cors import CORSMiddleware
3
  from pydantic import BaseModel
4
  from selenium import webdriver
5
+ from selenium.webdriver.chrome.service import Service
6
  from selenium.webdriver.chrome.options import Options
7
  from selenium.webdriver.common.by import By
8
  from selenium.webdriver.support.ui import WebDriverWait
 
13
  import os
14
  import shutil
15
  from pathlib import Path
 
 
 
16
 
17
  app = FastAPI()
18
 
 
39
  error: str | None
40
  processing_time: float
41
 
 
 
 
 
 
 
 
 
 
 
 
42
  def init_driver():
43
  options = Options()
44
  options.add_argument("--headless=new")
 
86
 
87
  try:
88
  video_url = request.url
89
+ if not ("youtube.com" in video_url or "youtu.be" in video_url):
90
  raise HTTPException(status_code=400, detail="Invalid YouTube URL")
91
 
92
  driver = init_driver()
93
  logger.info(f"Processing URL: {video_url}")
94
  driver.get(video_url)
95
 
96
+ try:
97
+ cookie_button = WebDriverWait(driver, 5).until(
98
+ EC.element_to_be_clickable((By.XPATH, "//*[contains(text(), 'Accept all')]"))
 
 
 
 
 
 
 
 
 
 
99
  )
100
+ cookie_button.click()
101
+ logger.info("Accepted cookies")
102
+ except TimeoutException:
103
+ logger.info("No cookie consent found")
104
+ pass
105
+
106
+ more_button = WebDriverWait(driver, 10).until(
107
+ EC.element_to_be_clickable((By.ID, "expand"))
108
+ )
109
+ driver.execute_script("arguments[0].click();", more_button)
110
 
111
+ transcript_button = WebDriverWait(driver, 10).until(
112
+ EC.element_to_be_clickable((By.CSS_SELECTOR, "button[aria-label='Show transcript']"))
113
+ )
114
+ driver.execute_script("arguments[0].click();", transcript_button)
 
115
 
116
+ WebDriverWait(driver, 15).until(
117
+ EC.presence_of_element_located((By.ID, "segments-container"))
118
+ )
 
119
 
120
+ segments = driver.find_elements(By.CSS_SELECTOR, "div.ytd-transcript-segment-renderer")
121
+ transcript = []
122
+ for segment in segments:
123
+ try:
124
+ text = segment.find_element(By.CLASS_NAME, "segment-text").text.strip()
125
+ if text:
126
+ transcript.append(text)
127
+ except:
128
+ continue
129
+
130
+ if not transcript:
131
+ raise HTTPException(status_code=404, detail="No transcript available")
 
 
 
 
 
 
 
 
 
132
 
 
 
 
133
  return TranscriptResponse(
134
+ success=True,
135
+ transcript=transcript,
136
+ error=None,
137
  processing_time=time.time() - start_time
138
  )
139
+
140
  except TimeoutException as e:
141
  error_msg = "Timed out waiting for page elements - the video might not have transcripts"
142
  logger.error(error_msg)