hamza2923 committed on
Commit
2e271fb
·
verified ·
1 Parent(s): 7be36d3

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +1 -181
main.py CHANGED
@@ -45,187 +45,7 @@ def init_driver():
45
  options.add_argument("--no-sandbox")
46
  options.add_argument("--disable-dev-shm-usage")
47
  options.add_argument("--disable-gpu")
48
-
49
- possible_chrome_paths = [
50
- "/usr/bin/google-chrome",
51
- "/usr/bin/google-chrome-stable",
52
- ]
53
- chrome_path = None
54
- for path in possible_chrome_paths:
55
- if os.path.exists(path):
56
- chrome_path = path
57
- break
58
-
59
- if not chrome_path:
60
- logger.error(f"No Chrome binary found in paths: {possible_chrome_paths}")
61
- raise Exception(f"No Chrome binary found in paths: {possible_chrome_paths}")
62
-
63
- options.binary_location = chrome_path
64
- logger.info(f"Using Chrome binary: {chrome_path}")
65
-
66
- try:
67
- chromedriver_path = shutil.which("chromedriver")
68
- if not chromedriver_path or not os.path.exists(chromedriver_path):
69
- logger.error(f"ChromeDriver not found at {chromedriver_path}")
70
- raise Exception(f"ChromeDriver not found at {chromedriver_path}")
71
-
72
- service = Service(executable_path=chromedriver_path)
73
- driver = webdriver.Chrome(service=service, options=options)
74
- chrome_version = driver.capabilities["browserVersion"]
75
- chromedriver_version = driver.capabilities["chrome"]["chromedriverVersion"].split()[0]
76
- logger.info(f"Chrome version: {chrome_version}, ChromeDriver version: {chromedriver_version}")
77
- return driver
78
- except Exception as e:
79
- logger.error(f"Driver initialization failed: {str(e)}")
80
- raise Exception(f"Driver initialization failed: {str(e)}")
81
-
82
- @app.post("/transcript", response_model=TranscriptResponse)
83
- async def get_transcript(request: VideoRequest):
84
- start_time = time.time()
85
- driver = None
86
-
87
- try:
88
- video_url = request.url
89
- if not ("youtube.com" in video_url or "youtu.be" in video_url):
90
- raise HTTPException(status_code=400, detail="Invalid YouTube URL")
91
-
92
- driver = init_driver()
93
- logger.info(f"Processing URL: {video_url}")
94
- driver.get(video_url)
95
-
96
- try:
97
- cookie_button = WebDriverWait(driver, 5).until(
98
- EC.element_to_be_clickable((By.XPATH, "//*[contains(text(), 'Accept all')]"))
99
- )
100
- cookie_button.click()
101
- logger.info("Accepted cookies")
102
- except TimeoutException:
103
- logger.info("No cookie consent found")
104
-
105
- logger.info("Clicking 'Show more' button")
106
- more_button = WebDriverWait(driver, 10).until(
107
- EC.element_to_be_clickable((By.ID, "expand"))
108
- )
109
- driver.execute_script("arguments[0].click();", more_button)
110
-
111
- logger.info("Clicking transcript button")
112
- transcript_button = WebDriverWait(driver, 10).until(
113
- EC.element_to_be_clickable((By.CSS_SELECTOR, "button[aria-label='Show transcript']"))
114
- )
115
- driver.execute_script("arguments[0].click();", transcript_button)
116
-
117
- logger.info("Waiting for transcript segments")
118
- WebDriverWait(driver, 15).until(
119
- EC.presence_of_element_located((By.ID, "segments-container"))
120
- )
121
-
122
- logger.info("Extracting transcript")
123
- segments = driver.find_elements(By.CSS_SELECTOR, "div.ytd-transcript-segment-renderer")
124
- transcript = []
125
- for segment in segments:
126
- try:
127
- text = segment.find_element(By.CLASS_NAME, "segment-text").text.strip()
128
- if text:
129
- transcript.append(text)
130
- except:
131
- continue
132
-
133
- if not transcript:
134
- raise HTTPException(status_code=404, detail="No transcript available")
135
-
136
- logger.info(f"Extracted {len(transcript)} transcript segments")
137
- return TranscriptResponse(
138
- success=True,
139
- transcript=transcript,
140
- error=None,
141
- processing_time=time.time() - start_time
142
- )
143
-
144
- except TimeoutException as e:
145
- error_msg = "Timed out waiting for page elements - the video might not have transcripts"
146
- logger.error(error_msg)
147
- return TranscriptResponse(
148
- success=False,
149
- transcript=None,
150
- error=error_msg,
151
- processing_time=time.time() - start_time
152
- )
153
- except Exception as e:
154
- logger.error(f"Error: {str(e)}")
155
- return TranscriptResponse(
156
- success=False,
157
- transcript=None,
158
- error=str(e),
159
- processing_time=time.time() - start_time
160
- )
161
- finally:
162
- if driver:
163
- driver.quit()
164
-
165
- @app.get("/health")
166
- def health_check():
167
- chrome_path = shutil.which("google-chrome")
168
- chromedriver_path = shutil.which("chromedriver")
169
- return {
170
- "ChromePath": chrome_path,
171
- "ChromeDriverPath": chromedriver_path,
172
- "ChromeExists": Path(chrome_path or "").exists(),
173
- "ChromeDriverExists": Path(chromedriver_path or "").exists()
174
- }
175
-
176
- @app.get("/")
177
- async def root():
178
- return {"message": "Welcome to YouTube Transcript API"}
179
-
180
- if __name__ == "__main__":
181
- import uvicorn
182
- uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 7860)))from fastapi import FastAPI, HTTPException
183
- from fastapi.middleware.cors import CORSMiddleware
184
- from pydantic import BaseModel
185
- from selenium import webdriver
186
- from selenium.webdriver.chrome.service import Service
187
- from selenium.webdriver.chrome.options import Options
188
- from selenium.webdriver.common.by import By
189
- from selenium.webdriver.support.ui import WebDriverWait
190
- from selenium.webdriver.support import expected_conditions as EC
191
- from selenium.common.exceptions import TimeoutException
192
- import time
193
- import logging
194
- import os
195
- import shutil
196
- from pathlib import Path
197
-
198
- app = FastAPI()
199
-
200
- # Configure CORS
201
- app.add_middleware(
202
- CORSMiddleware,
203
- allow_origins=["*"],
204
- allow_credentials=True,
205
- allow_methods=["*"],
206
- allow_headers=["*"],
207
- )
208
-
209
- # Configure logging
210
- logging.basicConfig(level=logging.INFO)
211
- logger = logging.getLogger(__name__)
212
-
213
- # Pydantic models
214
- class VideoRequest(BaseModel):
215
- url: str
216
-
217
- class TranscriptResponse(BaseModel):
218
- success: bool
219
- transcript: list[str] | None
220
- error: str | None
221
- processing_time: float
222
-
223
- def init_driver():
224
- options = Options()
225
- options.add_argument("--headless=new")
226
- options.add_argument("--no-sandbox")
227
- options.add_argument("--disable-dev-shm-usage")
228
- options.add_argument("--disable-gpu")
229
 
230
  possible_chrome_paths = [
231
  "/usr/bin/google-chrome",
 
45
  options.add_argument("--no-sandbox")
46
  options.add_argument("--disable-dev-shm-usage")
47
  options.add_argument("--disable-gpu")
48
+ options.add_argument("--disable-extensions")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  possible_chrome_paths = [
51
  "/usr/bin/google-chrome",