File size: 1,001 Bytes
a2709ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import requests
from bs4 import BeautifulSoup
from langchain.tools import Tool

def scrape_webpage(url: str) -> str:
    """Download a webpage and return its visible text as a single string.

    The page is requested with a 10-second timeout and a custom User-Agent
    header. ``<script>`` and ``<style>`` elements are removed before the
    remaining text fragments are joined with single spaces.

    Args:
        url: Address of the page to fetch.

    Returns:
        At most the first 3000 characters of the extracted text. If any
        ``Exception`` is raised while fetching or parsing, a string of the
        form ``[WebScraper error: <message>]`` is returned instead of the
        exception propagating.
    """
    try:
        page = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0 (compatible; AI-Agent/1.0)"},
            timeout=10,
        )
        # Turn HTTP error statuses into exceptions so they are reported below.
        page.raise_for_status()

        document = BeautifulSoup(page.content, "html.parser")
        # Drop script and style elements so only readable text is collected.
        for tag in document.find_all(["script", "style"]):
            tag.decompose()

        fragments = document.stripped_strings
        # Cap the length to avoid overloading the LLM that consumes the result.
        return " ".join(fragments)[:3000]
    except Exception as exc:
        return f"[WebScraper error: {exc}]"

# Wrap scrape_webpage in a LangChain Tool exposed under the name "web_scraper".
web_scraper_tool = Tool.from_function(
    func=scrape_webpage,
    name="web_scraper",
    description="Fetches and extracts main text content from a webpage using its URL.",
)