Use live web search as your RAG retrieval layer
Most RAG tutorials retrieve from a static vector database built from your own documents. But when your agent needs current information — today's news, recent research, live product data — a vector store is the wrong tool. Superhighway gives you live web retrieval over HTTP, so you can feed fresh results straight into your LLM context without any embedding pipeline.
The pattern
Traditional RAG:
query → embed → vector DB → top-k chunks → LLM
Web-retrieval RAG with Superhighway:
query → Superhighway /search or /research → structured results → LLM
The retrieval step is a live HTTP call instead of a vector lookup. No ingestion, no embedding model, no index to maintain. Results are always fresh.
Standalone Python (no framework)
import os, requests
# Bearer token for the Superhighway API. Indexing os.environ raises KeyError
# as soon as this line runs if SUPERHIGHWAY_API_KEY is not set.
API_KEY = os.environ["SUPERHIGHWAY_API_KEY"]
# Root URL that endpoint paths such as /search and /research are appended to.
BASE = "https://superhighway.walls.sh"
def retrieve(query: str, n: int = 5) -> list[dict]:
    """Run a live Superhighway web search and return its result entries.

    Args:
        query: Search query, sent as the ``q`` parameter.
        n: Number of results requested, sent as the ``count`` parameter.

    Returns:
        The ``results`` list from the JSON response; each entry is expected
        to carry ``title``, ``url`` and ``description`` keys.

    Raises:
        requests.HTTPError: If the API responds with an error status.
    """
    response = requests.get(
        f"{BASE}/search",
        params={"q": query, "count": n},
        headers={"Authorization": f"Bearer {API_KEY}"},
        timeout=15,
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error body.
    response.raise_for_status()
    payload = response.json()
    return payload["results"]  # [{title, url, description}, ...]
def rag_answer(question: str, client) -> str:
    """Answer *question* with a Claude model grounded in live search results.

    Args:
        question: Natural-language question; also used verbatim as the
            search query.
        client: An ``anthropic.Anthropic`` client (any object exposing a
            compatible ``messages.create`` method works).

    Returns:
        The text of the first content block in the model's reply.

    Raises:
        requests.HTTPError: Propagated from ``retrieve`` when the search
            request fails.
    """
    results = retrieve(question)
    # Number each hit from 1 so the model can cite sources as [1], [2], ...
    context = "\n\n".join(
        f"[{i}] {hit['title']}\n{hit['url']}\n{hit['description']}"
        for i, hit in enumerate(results, start=1)
    )
    # The Anthropic Messages API is reached via `client.messages.create` and
    # takes the system prompt as a top-level argument; the OpenAI-style
    # `client.chat.completions.create` with a "system" role message does not
    # exist on an Anthropic client, and `.content[0].text` is the Anthropic
    # response shape.
    response = client.messages.create(
        model="claude-opus-4-8",
        max_tokens=1024,
        system="Answer the question using only the search results below. Cite sources by number.",
        messages=[
            {"role": "user", "content": f"Search results:\n{context}\n\nQuestion: {question}"},
        ],
    )
    return response.content[0].text
# from anthropic import Anthropic
# print(rag_answer("What happened in AI this week?", Anthropic()))
One-call deep retrieval: /research
For questions that benefit from reading multiple pages, use /research. It searches, fetches the top result pages, and synthesises them — all in one call. Use this as your retrieval step when you need paragraph-level content, not just snippets:
def retrieve_deep(query: str) -> str:
    """Fetch a synthesised answer for *query* from the ``/research`` endpoint.

    Args:
        query: Research question, sent as the ``q`` parameter.

    Returns:
        The ``answer`` field of the JSON response.

    Raises:
        requests.HTTPError: If the API responds with an error status.
    """
    response = requests.get(
        f"{BASE}/research",
        params={"q": query},
        headers={"Authorization": f"Bearer {API_KEY}"},
        timeout=30,  # longer than /search: this call uses a 30 s budget
    )
    response.raise_for_status()
    payload = response.json()
    # already synthesised — pass directly to your LLM as context
    return payload["answer"]
# Now your RAG pipeline is one extra HTTP call, not a whole embedding stack.
# `context` holds the "answer" string returned by /research.
context = retrieve_deep("latest advances in protein folding 2025")
Expect 5–15 s for /research — it's reading pages in real time. Set your HTTP client timeout to 30 s.
LangChain LCEL retriever
from langchain_core.retrievers import BaseRetriever
from langchain_core.documents import Document
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
import requests, os
class SuperhighwayRetriever(BaseRetriever):
    """LangChain retriever backed by the Superhighway ``/search`` endpoint."""

    # Bearer token; read from SUPERHIGHWAY_API_KEY when the class body runs,
    # falling back to an empty string if the variable is unset.
    api_key: str = os.environ.get("SUPERHIGHWAY_API_KEY", "")
    # Root URL of the Superhighway API.
    base_url: str = "https://superhighway.walls.sh"
    # Number of results requested per query (sent as ``count``).
    k: int = 5

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> list[Document]:
        """Search the web for *query* and wrap each hit in a ``Document``.

        Each document's ``page_content`` is the hit's description; its
        ``title`` and ``url`` are stored in ``metadata``. ``run_manager`` is
        accepted to satisfy the retriever interface and is not used here.

        Raises:
            requests.HTTPError: If the API responds with an error status.
        """
        response = requests.get(
            f"{self.base_url}/search",
            params={"q": query, "count": self.k},
            headers={"Authorization": f"Bearer {self.api_key}"},
            timeout=15,
        )
        response.raise_for_status()

        documents = []
        for hit in response.json()["results"]:
            metadata = {"title": hit["title"], "url": hit["url"]}
            documents.append(Document(page_content=hit["description"], metadata=metadata))
        return documents
# No arguments: the retriever uses its class-level defaults
# (API key from SUPERHIGHWAY_API_KEY, k = 5).
retriever = SuperhighwayRetriever()
# Prompt with two slots: {context} for the formatted search results and
# {question} for the user's question.
prompt = ChatPromptTemplate.from_template(
    "Answer the question using these web search results:\n{context}\n\nQuestion: {question}"
)
def format_docs(docs):
    """Render retrieved documents as Markdown links followed by their text.

    Each document becomes ``[title](url)`` on one line and its
    ``page_content`` on the next; documents are separated by a blank line.
    An empty input yields an empty string.
    """
    rendered = []
    for doc in docs:
        meta = doc.metadata
        rendered.append(f"[{meta['title']}]({meta['url']})\n{doc.page_content}")
    return "\n\n".join(rendered)
# LCEL pipeline: the input question is fed both to the retriever (whose
# documents are flattened to text by format_docs) and, unchanged, to the
# "question" slot; the filled prompt then goes to the chat model and the
# reply is parsed down to a plain string.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | ChatOpenAI(model="gpt-4o")
    | StrOutputParser()
)
answer = chain.invoke("What are the top AI tools released this month?")
print(answer)
LlamaIndex query engine
from llama_index.core import QueryBundle
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.schema import NodeWithScore, TextNode
from llama_index.core.query_engine import RetrieverQueryEngine
import requests, os
class SuperhighwayRetriever(BaseRetriever):
    """LlamaIndex retriever backed by the Superhighway ``/search`` endpoint."""

    def __init__(self, api_key: str, base_url: str = "https://superhighway.walls.sh", k: int = 5):
        """Store the connection settings, then initialise the base retriever.

        Args:
            api_key: Bearer token sent with every request.
            base_url: Root URL of the Superhighway API.
            k: Number of results requested per query (sent as ``count``).
        """
        self._api_key = api_key
        self._base_url = base_url
        self._k = k
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> list[NodeWithScore]:
        """Search the web for the bundle's query string and return scored nodes.

        Each hit becomes a ``TextNode`` whose text is the hit's description
        and whose metadata holds ``title`` and ``url``. Every node is given
        the same score of 1.0.

        Raises:
            requests.HTTPError: If the API responds with an error status.
        """
        response = requests.get(
            f"{self._base_url}/search",
            params={"q": query_bundle.query_str, "count": self._k},
            headers={"Authorization": f"Bearer {self._api_key}"},
            timeout=15,
        )
        response.raise_for_status()

        scored_nodes = []
        for hit in response.json()["results"]:
            node = TextNode(
                text=hit["description"],
                metadata={"title": hit["title"], "url": hit["url"]},
            )
            scored_nodes.append(NodeWithScore(node=node, score=1.0))
        return scored_nodes
# Indexing os.environ raises KeyError if SUPERHIGHWAY_API_KEY is not set.
retriever = SuperhighwayRetriever(api_key=os.environ["SUPERHIGHWAY_API_KEY"])
# Wrap the retriever in a query engine built with from_args defaults.
query_engine = RetrieverQueryEngine.from_args(retriever=retriever)
response = query_engine.query("What are the latest developments in quantum computing?")
print(response)
When to use web retrieval vs. a vector store
| Use case | Retrieval approach |
|---|---|
| Current events, news, live data | Superhighway /search or /research |
| Your own docs, PDFs, code bases | Vector store (Chroma, Pinecone, Weaviate) |
| Mix of fresh web + your docs | Hybrid: Superhighway + vector store in parallel, then merge |
| Deep synthesis from multiple pages | Superhighway /research (reads pages for you) |
Install
pip install requests langchain-core langchain-openai # LangChain path
pip install requests llama-index # LlamaIndex path
Get your API key at /pricing (free tier: 1,000 calls/month). Set it as SUPERHIGHWAY_API_KEY. For more integration options see the full guide list.