The Quick Version#
A web research agent takes a question, decides what to search for, reads the results, and synthesizes an answer with citations. It’s an LLM with two tools: search the web and read a URL. The agent decides when and how to use them.
pip install openai httpx beautifulsoup4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import json
import os

import httpx
from bs4 import BeautifulSoup
from openai import OpenAI
# Module-level OpenAI client shared by every completion call below.
client = OpenAI()
def web_search(query: str, num_results: int = 5) -> list[dict]:
    """Search the web using SerpAPI (or any search API).

    Args:
        query: The search query string.
        num_results: Maximum number of results to return.

    Returns:
        Up to ``num_results`` dicts with "title", "url", and "snippet" keys.

    Raises:
        httpx.HTTPStatusError: On a non-2xx response (e.g. bad key, quota).
    """
    response = httpx.get(
        "https://serpapi.com/search",
        params={
            "q": query,
            "num": num_results,
            # Read the key from the environment instead of hard-coding a
            # placeholder literal; the fallback keeps the original behavior.
            "api_key": os.environ.get("SERPAPI_API_KEY", "YOUR_SERPAPI_KEY"),
        },
        timeout=10,  # don't hang the agent loop on a slow search API
    )
    # Fail loudly on auth/quota errors instead of silently returning [].
    response.raise_for_status()
    data = response.json()
    return [
        {
            "title": item["title"],
            "url": item["link"],
            "snippet": item.get("snippet", ""),
        }
        for item in data.get("organic_results", [])[:num_results]
    ]
def read_url(url: str, max_chars: int = 8000) -> str:
    """Fetch and extract the visible text content of a URL.

    Args:
        url: Page to fetch (redirects are followed).
        max_chars: Cap on the returned text length.

    Returns:
        At most ``max_chars`` characters of page text, or an
        "Error reading URL: ..." string on any failure.
    """
    try:
        response = httpx.get(url, timeout=10, follow_redirects=True,
                             headers={"User-Agent": "ResearchBot/1.0"})
        # Fix: treat 4xx/5xx as errors instead of parsing the error page
        # and feeding it to the model as if it were real content.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Strip boilerplate elements before extracting text.
        for tag in soup(["script", "style", "nav", "footer", "header"]):
            tag.decompose()
        text = soup.get_text(separator="\n", strip=True)
        return text[:max_chars]
    except Exception as e:
        # Deliberate best-effort: the agent can read and act on the error.
        return f"Error reading URL: {e}"
# Quick test — performs real network calls via the tools defined above.
results = web_search("transformer architecture explained 2024")
print(f"Found {len(results)} results")
# Fix: guard against an empty result list (previously an IndexError).
if results:
    content = read_url(results[0]["url"])
    print(f"Read {len(content)} chars from {results[0]['url']}")
else:
    print("No results returned — check your search API key.")
|
That gives you the two core tools. Now wire them into an agent loop.
The Agent Loop#
The agent uses function calling to decide between searching, reading, or answering. It can do multiple searches, read several pages, and cross-reference information before producing a final answer.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
def _tool_spec(name: str, description: str, param: str, param_desc: str) -> dict:
    """Build an OpenAI function-tool schema with one required string parameter."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": {
                    param: {"type": "string", "description": param_desc},
                },
                "required": [param],
            },
        },
    }


# Tool schemas the model can choose between each turn.
tools = [
    _tool_spec(
        "web_search",
        "Search the web for information. Returns titles, URLs, and snippets.",
        "query",
        "Search query",
    ),
    _tool_spec(
        "read_url",
        "Read the text content of a web page. Use after searching to get detailed information.",
        "url",
        "URL to read",
    ),
]
def handle_tool_call(tool_call) -> str:
    """Execute the tool the model requested and return its output as a string."""
    tool_name = tool_call.function.name
    arguments = json.loads(tool_call.function.arguments)
    if tool_name == "read_url":
        return read_url(arguments["url"])
    if tool_name == "web_search":
        hits = web_search(arguments["query"])
        return json.dumps(hits, indent=2)
    return "Unknown tool"
def research(question: str, max_steps: int = 10) -> str:
    """Run the research agent on a question.

    Loops up to ``max_steps`` model turns, executing any tool calls the
    model makes, and returns the model's final text answer.
    """
    system_prompt = (
        "You are a research agent. Your job is to answer questions thoroughly "
        "using web search and page reading. Always cite your sources with URLs. "
        "Search first, then read the most relevant pages, then synthesize an answer. "
        "Do multiple searches if the first results don't fully answer the question."
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": question},
    ]
    for step in range(1, max_steps + 1):
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=messages,
            tools=tools,
        )
        reply = completion.choices[0].message
        messages.append(reply)
        # No tool calls means the model has produced its final answer.
        if not reply.tool_calls:
            return reply.content
        for call in reply.tool_calls:
            print(f" Step {step}: {call.function.name}({call.function.arguments[:80]}...)")
            messages.append({
                "role": "tool",
                "tool_call_id": call.id,
                "content": handle_tool_call(call),
            })
    return "Research incomplete — hit step limit."
# Run it — kicks off a live session using the tools and client defined above.
answer = research("What are the latest advances in protein structure prediction since AlphaFold 3?")
print(answer)
|
The agent typically does 2-3 searches and reads 3-5 pages before producing a cited answer. The system prompt is important — without it, the model tends to answer from training data instead of actually searching.
Adding Source Tracking and Citations#
For research you can trust, track every source the agent reads and include proper citations in the output.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
| from dataclasses import dataclass, field
from datetime import datetime
@dataclass
class Source:
    """One web source the agent encountered, with its access timestamp."""
    # Page URL — also the deduplication key when sources are reported.
    url: str
    title: str
    # ISO-8601 timestamp captured when this record is created.
    accessed_at: str = field(default_factory=lambda: datetime.now().isoformat())
    # Search-result snippet, when the source came from a search (else empty).
    snippet: str = ""
class ResearchAgent:
def __init__(self, model: str = "gpt-4o"):
self.client = OpenAI()
self.model = model
self.sources: list[Source] = []
def _handle_tool(self, tool_call) -> str:
name = tool_call.function.name
args = json.loads(tool_call.function.arguments)
if name == "web_search":
results = web_search(args["query"])
for r in results:
self.sources.append(Source(url=r["url"], title=r["title"], snippet=r["snippet"]))
return json.dumps(results, indent=2)
elif name == "read_url":
content = read_url(args["url"])
return content
return "Unknown tool"
def research(self, question: str) -> dict:
self.sources = []
messages = [
{
"role": "system",
"content": (
"You are a research agent. Search the web, read relevant pages, "
"and provide a thorough answer with inline citations like [1], [2]. "
"At the end, list all sources with their URLs."
),
},
{"role": "user", "content": question},
]
for _ in range(12):
response = self.client.chat.completions.create(
model=self.model, messages=messages, tools=tools,
)
msg = response.choices[0].message
messages.append(msg)
if not msg.tool_calls:
unique_sources = {s.url: s for s in self.sources}
return {
"answer": msg.content,
"sources": list(unique_sources.values()),
"num_searches": sum(
1 for m in messages
if hasattr(m, "tool_calls") and m.tool_calls
and any(tc.function.name == "web_search" for tc in m.tool_calls)
),
}
for tc in msg.tool_calls:
result = self._handle_tool(tc)
messages.append({"role": "tool", "tool_call_id": tc.id, "content": result})
return {"answer": "Incomplete", "sources": self.sources, "num_searches": 0}
# Example run — performs live API and web calls via the tools defined above.
agent = ResearchAgent()
result = agent.research("Compare the performance of Mamba vs Transformer architectures for long sequences")
print(result["answer"])
# Show which pages backed the answer.
print(f"\nSources used: {len(result['sources'])}")
for s in result["sources"]:
    print(f" - {s.title}: {s.url}")
|
Parallel Search with Multiple Queries#
Better research often requires searching from different angles. Have the agent generate multiple search queries and run them in parallel.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
| import concurrent.futures
def multi_search(queries: list[str]) -> list[dict]:
    """Run multiple searches in parallel and deduplicate results.

    Results are merged in completion order; the first occurrence of each
    URL wins.
    """
    merged: list[dict] = []
    visited_urls: set[str] = set()
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
        pending = {pool.submit(web_search, query): query for query in queries}
        for done in concurrent.futures.as_completed(pending):
            for hit in done.result():
                url = hit["url"]
                if url in visited_urls:
                    continue
                visited_urls.add(url)
                merged.append(hit)
    return merged
# Add as a tool for the agent
_queries_schema = {
    "type": "array",
    "items": {"type": "string"},
    "description": "List of search queries to run",
}

# Function-tool schema exposing multi_search to the model.
multi_search_tool = {
    "type": "function",
    "function": {
        "name": "multi_search",
        "description": "Run multiple search queries in parallel. Use when you need to research a topic from different angles.",
        "parameters": {
            "type": "object",
            "properties": {"queries": _queries_schema},
            "required": ["queries"],
        },
    },
}
|
Rate Limiting and Caching#
Real-world research agents need rate limiting to avoid hammering search APIs and caching to avoid re-reading the same pages.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
| import hashlib
import time
from functools import lru_cache
# Simple in-memory cache for URL content. Fix: lru_cache also cached the
# "Error reading URL: ..." failure strings, so one transient failure poisoned
# that URL for the cache's lifetime. This bounded cache skips error results
# (FIFO eviction rather than strict LRU; fine for a 100-entry cache).
_URL_CACHE: dict[str, str] = {}
_URL_CACHE_MAX = 100


def cached_read_url(url: str) -> str:
    """Like read_url, but memoizes up to 100 successful fetches."""
    if url in _URL_CACHE:
        return _URL_CACHE[url]
    content = read_url(url)
    # Only cache successful reads so a retry can succeed later.
    if not content.startswith("Error reading URL:"):
        if len(_URL_CACHE) >= _URL_CACHE_MAX:
            # Evict the oldest entry (dicts preserve insertion order).
            _URL_CACHE.pop(next(iter(_URL_CACHE)))
        _URL_CACHE[url] = content
    return content
# Rate limiter
class RateLimiter:
    """Sliding-window limiter: at most `calls_per_minute` calls per 60 s."""

    def __init__(self, calls_per_minute: int = 30):
        self.calls_per_minute = calls_per_minute
        # Wall-clock times of calls made within the current window.
        self.timestamps: list[float] = []

    def wait(self):
        """Block until another call is permitted, then record it."""
        now = time.time()
        # Drop timestamps that have aged out of the 60-second window.
        window = [stamp for stamp in self.timestamps if now - stamp < 60]
        self.timestamps = window
        if len(window) >= self.calls_per_minute:
            # Sleep until the oldest call in the window expires.
            time.sleep(max(60 - (now - window[0]), 0))
        self.timestamps.append(time.time())
# One shared limiter: a global budget of 30 searches per minute.
limiter = RateLimiter(calls_per_minute=30)


def rate_limited_search(query: str) -> list[dict]:
    """Drop-in replacement for web_search that honors the shared rate limit."""
    limiter.wait()
    return web_search(query)
|
Common Errors and Fixes#
Agent answers from training data without searching
Strengthen the system prompt: “You MUST search the web before answering. Never answer from memory alone.” Also check that the tool definitions are being passed correctly.
Read URL returns garbage text
Some sites block bots or serve JavaScript-rendered content. Add a fallback: if BeautifulSoup returns less than 100 characters of text, try using the search snippet instead. For JavaScript-heavy sites, use a headless browser like Playwright.
Agent reads too many pages and hits token limits
Set max_chars lower in read_url (4000-6000 chars) and limit the agent to reading 5 pages max. You can also add a “summarize_page” tool that condenses content before adding it to the conversation.
Search API returns irrelevant results
Add date filters to queries for current topics: append “2025 2026” to the query. For technical topics, add “site:github.com OR site:arxiv.org” to focus on primary sources.
Agent loops between searching and reading without answering
Add an explicit instruction: “After reading 3-5 pages, you must provide your final answer. Do not continue searching indefinitely.” The max_steps limit is your safety net.