import io

import requests
from openai import OpenAI
from smolagents import (
    CodeAgent,
    FinalAnswerTool,
    GoogleSearchTool,
    OpenAIServerModel,
    PythonInterpreterTool,
    Tool,
    VisitWebpageTool,
    WikipediaSearchTool,
    tool,
)
| |
|
| |
|
def get_prompt() -> str:
    """Load the agent's system prompt from ``prompt.txt`` in the working directory.

    Returns:
        str: The full contents of ``prompt.txt``.

    Raises:
        FileNotFoundError: If ``prompt.txt`` does not exist.
    """
    # Pin the encoding so the prompt reads identically on every platform
    # (without it, Windows falls back to the locale codec).
    with open("prompt.txt", "r", encoding="utf-8") as f:
        return f.read()
| |
|
| |
|
@tool
def visual_qa(image_url: str, question: str) -> str:
    """
    Provides functionality to perform visual question answering (VQA) by processing an image and a natural language question.

    Args:
        image_url (str): A URL pointing to the location of the image to be analyzed. The URL should be accessible and point to a valid image file.
        question (str): A natural language string containing the question to be answered based on the provided image.

    Returns:
        str: The model-generated answer to the provided question based on the analysis of the image.
    """
    # OpenAI is already imported at module level; no need to re-import here.
    # The client reads OPENAI_API_KEY from the environment.
    client = OpenAI()

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url,
                        # "low" detail keeps token usage (and cost) down.
                        "detail": "low"
                    },
                },
            ],
        }],
    )
    return response.choices[0].message.content
| |
|
| |
|
@tool
def transcribe_audio(audio_url: str) -> str:
    """
    Provides functionality to perform audio transcription.

    Args:
        audio_url (str): A URL pointing to the location of the audio to be analyzed.

    Returns:
        str: Audio transcription.

    Raises:
        requests.HTTPError: If the audio file cannot be downloaded.
    """
    # Download up front and fail fast on HTTP errors instead of sending a
    # broken payload to the transcription endpoint.
    resp = requests.get(audio_url, timeout=60)
    resp.raise_for_status()

    # The SDK expects a named file-like object (the name's extension drives
    # format detection); raw bytes from `.content` are rejected.
    audio_file = io.BytesIO(resp.content)
    audio_file.name = audio_url.split("?")[0].rsplit("/", 1)[-1] or "audio.mp3"

    client = OpenAI()
    transcription = client.audio.transcriptions.create(
        model="gpt-4o-mini-transcribe",
        file=audio_file,
        response_format="text",
    )
    # With response_format="text" the SDK returns a plain str, not an object
    # with a `.text` attribute — the original `r.text` raised AttributeError.
    return transcription if isinstance(transcription, str) else transcription.text
| |
|
| |
|
class GAIAAgent:
    """A CodeAgent wired with web-search, browsing, Wikipedia, Python-execution,
    and the custom media tools (visual QA, audio transcription) defined above."""

    def __init__(self):
        # Assemble the toolbox: stock smolagents tools plus the two
        # @tool-decorated helpers from this module.
        toolbox = [
            GoogleSearchTool(provider="serper"),
            VisitWebpageTool(),
            WikipediaSearchTool(),
            PythonInterpreterTool(),
            FinalAnswerTool(),
            visual_qa,
            transcribe_audio,
        ]
        # temperature=0 keeps runs as deterministic as the backend allows.
        llm = OpenAIServerModel(model_id='gpt-4.1-mini', max_tokens=4096, temperature=0)
        self.agent = CodeAgent(
            tools=toolbox,
            model=llm,
            add_base_tools=False,
            max_steps=15,
            additional_authorized_imports=["pandas"],
        )
        # System prompt loaded once from prompt.txt at construction time.
        self.prompt = get_prompt()

    def __call__(self, question: str) -> str:
        """Run the agent on *question* and return its final answer string."""
        return self.agent.run(self.prompt, additional_args={"question": question})
| |
|
| |
|
if __name__ == '__main__':
    # Script entry point: constructing the agent exercises tool/model wiring
    # and loads prompt.txt; no question is actually run here.
    agent = GAIAAgent()
| |
|