migrate agent-proxy to Netlify Edge Function so long sessions stream end-to-end

The reconnect + events.list backfill in c283d88 is correct but never ran:
the previous v2 Node Function was killed at ~27 s (well before the 20 min
reconnect budget could matter), so streams always died after the first MCP
tool batch.

Move the proxy to a Netlify Edge Function (Deno runtime) which has no
streaming-duration cap as long as we keep writing to the response body.
Same reconnect / backfill / dedupe-by-event-id pattern; same NDJSON wire
protocol to the browser. Implemented with plain fetch() against the
Anthropic REST API (npm packages on Edge are beta) so we have no SDK
runtime dependency.

Frontend now POSTs to /api/agent-proxy. The Anthropic SDK is removed
from the package; @netlify/edge-functions is added for ambient types.

Co-Authored-By: alex <alex@semipublic.co>
This commit is contained in:
Devin AI
2026-05-13 12:40:31 +00:00
parent c283d884cf
commit d1c5be112e
6 changed files with 1984 additions and 196 deletions
-435
View File
@@ -1,435 +0,0 @@
import Anthropic from '@anthropic-ai/sdk'
/**
* Netlify Functions v2 handler.
*
* Proxies a request from the React frontend to a Claude Console-defined
* Managed Agent via the /v1/sessions endpoints. The agent's model, system
* prompt, tools, MCP servers, and skills are configured in the Console and
* referenced here by ID.
*
* Wire protocol (downstream to the browser): newline-delimited JSON
* (`application/x-ndjson`). One JSON object per line; the React side
* parses incrementally as bytes arrive. Object shapes:
*
* {"type":"text","text":"...markdown..."} // append to brief
* {"type":"status","kind":"thinking"}
* {"type":"status","kind":"tool_use","name":"...","label":"..."}
* {"type":"status","kind":"tool_result","ok":true}
* {"type":"status","kind":"tool_result","ok":false,"message":"..."}
* {"type":"status","kind":"session_error","message":"..."}
* {"type":"heartbeat"} // keep-alive only
* {"type":"done"} // session terminated cleanly
* {"type":"error","message":"..."} // unrecoverable
*
* Reconnect strategy: the upstream Anthropic SSE stream
* (`GET /v1/sessions/{id}/events/stream`) is not guaranteed to stay open
* for the full life of a multi-turn session — long inter-turn gaps (e.g.
* while MCP tools or the next model request are running) can trigger
* upstream / proxy read timeouts. When the upstream iterator ends without
* a terminal `session.status_*` event we treat it as a drop, fetch any
* events we missed via `sessions.events.list()` (deduped by event id),
* reopen the live stream, and keep going — up to a wall-clock budget.
*/
// Beta opt-in sent as the `anthropic-beta` header on every Managed
// Agents request made through the shared client.
const MANAGED_AGENTS_BETA = 'managed-agents-2026-04-01'

// How long the downstream (browser) connection may stay silent before we
// write a `{"type":"heartbeat"}` line, so the browser's fetch reader does
// not time out or look stuck while the upstream is quiet.
const HEARTBEAT_MS = 10 * 1000

// Upper bound on the wall-clock time one browser request may spend
// reconnecting to the upstream stream (20 minutes). Once it is spent the
// handler writes an `error` line and closes, so the user can retry
// instead of the request hanging indefinitely.
const RECONNECT_BUDGET_MS = 20 * 60_000

// Pause between an upstream drop and the next reconnect attempt, so a
// tight failure loop cannot hammer the API.
const RECONNECT_BACKOFF_MS = 500
// ---------------------------------------------------------------------------
// Anthropic client (reused across warm Lambda invocations)
// ---------------------------------------------------------------------------
// Module-level cache: the first successful call builds the client, later
// calls in the same module instance get the same object back.
let _client

/**
 * Returns the shared Anthropic SDK client, constructing it on first use.
 *
 * The client is configured with the API key from the environment and
 * sends the Managed Agents beta header on every request.
 *
 * @returns {Anthropic} the cached client instance
 * @throws {Error} when `ANTHROPIC_API_KEY` is unset or empty
 */
function getClient() {
  if (!_client) {
    const apiKey = process.env.ANTHROPIC_API_KEY
    if (!apiKey) {
      throw new Error('Server is missing ANTHROPIC_API_KEY')
    }
    const defaultHeaders = { 'anthropic-beta': MANAGED_AGENTS_BETA }
    _client = new Anthropic({ apiKey, defaultHeaders })
  }
  return _client
}
/**
 * Decides whether a session event means the session is finished and the
 * proxy should stop tailing it.
 *
 * Terminal events are:
 *   - `session.status_terminated`
 *   - `session.status_idle` whose `stop_reason.type` is `end_turn` or
 *     `retries_exhausted`
 *
 * Any other idle stop reason is deliberately NOT terminal. In particular
 * `requires_action` means the agent is waiting on a client-side response
 * (custom tool result, tool confirmation); this app exposes no custom
 * tools so it should never occur, but if it does the caller keeps tailing.
 *
 * @param {object|null|undefined} event - a session event (may be nullish)
 * @returns {boolean} true when the event ends the session
 */
function isTerminal(event) {
  switch (event?.type) {
    case 'session.status_terminated':
      return true
    case 'session.status_idle': {
      const reason = event.stop_reason?.type
      return reason === 'end_turn' || reason === 'retries_exhausted'
    }
    default:
      return false
  }
}
/**
 * Builds the short human-readable label shown in the UI for a tool call.
 *
 *   - `web_search` with a query  -> "Searching the web: <query>"
 *   - `web_fetch` with a url     -> "Fetching: <url>"
 *   - `agent.mcp_tool_use`       -> "<name> (k=v, ...)" using up to three
 *     arguments; `limit` is skipped, object values are shown as "…", and
 *     other values are stringified and cut to 40 characters
 *   - anything else              -> the tool name (or "tool" if unnamed)
 *
 * @param {object} event - a tool-use event with optional `name`/`input`
 * @returns {string} the label
 */
function describeToolUse(event) {
  const toolName = event.name || 'tool'
  const params = event.input || {}

  if (toolName === 'web_search' && params.query) {
    return `Searching the web: ${params.query}`
  }
  if (toolName === 'web_fetch' && params.url) {
    return `Fetching: ${params.url}`
  }
  if (event.type !== 'agent.mcp_tool_use') {
    return toolName
  }

  // Summarise at most three arguments, ignoring the pagination knob.
  const shown = []
  for (const [key, value] of Object.entries(params)) {
    if (key === 'limit') continue
    if (shown.length === 3) break
    if (typeof value === 'object') {
      shown.push(`${key}=…`)
    } else {
      shown.push(`${key}=${String(value).slice(0, 40)}`)
    }
  }

  const summary = shown.join(', ')
  return summary ? `${toolName} (${summary})` : toolName
}
// ---------------------------------------------------------------------------
// Handler
// ---------------------------------------------------------------------------
/**
 * Netlify Functions v2 entry point for the agent proxy.
 *
 * Flow:
 *   1. Accept only POST with a JSON body carrying `stationName`,
 *      `stationLocation` and `stationWebsite`; validate the website URL.
 *   2. Read agent / environment / vault ids from the environment.
 *   3. Create a session, open its event stream, THEN send the kickoff
 *      `user.message` (stream-first so no early event is missed).
 *   4. Return an NDJSON streaming response that relays session events to
 *      the browser, writes heartbeats while the upstream is quiet, and
 *      reconnects + backfills (deduped by event id) when the upstream
 *      stream drops before a terminal event, within a wall-clock budget.
 *
 * If the browser disconnects, the response stream's `cancel()` aborts
 * whichever upstream stream is currently open and stops the reconnect
 * loop, so no work continues on behalf of a reader that is gone.
 *
 * @param {Request} req - incoming request
 * @returns {Promise<Response>} 405 / 400 / 500 plain-text responses for
 *   setup failures, otherwise a 200 `application/x-ndjson` stream
 */
export default async function handler(req /*, context */) {
  if (req.method !== 'POST') {
    return new Response('Method Not Allowed', { status: 405 })
  }

  let payload
  try {
    payload = await req.json()
  } catch {
    return new Response('Invalid JSON body', { status: 400 })
  }

  const {
    stationName = '',
    stationLocation = '',
    stationWebsite = '',
  } = payload || {}

  // Server-side URL validation (mirrors the client-side regex).
  const URL_REGEX =
    /^(https?:\/\/)?([\w-]+\.)+[\w-]{2,}(\/[\w\-._~:/?#[\]@!$&'()*+,;=%]*)?$/i
  if (!URL_REGEX.test(stationWebsite)) {
    return new Response('Invalid station website URL', { status: 400 })
  }

  const AGENT_ID = process.env.ANTHROPIC_AGENT_ID
  const ENVIRONMENT_ID = process.env.ANTHROPIC_ENVIRONMENT_ID
  // Optional comma-separated list; blanks and surrounding whitespace dropped.
  const VAULT_IDS = (process.env.ANTHROPIC_VAULT_IDS || '')
    .split(',')
    .map((s) => s.trim())
    .filter(Boolean)

  const missing = []
  if (!AGENT_ID) missing.push('ANTHROPIC_AGENT_ID')
  if (!ENVIRONMENT_ID) missing.push('ANTHROPIC_ENVIRONMENT_ID')
  if (missing.length) {
    return new Response(`Server is missing env var(s): ${missing.join(', ')}`, {
      status: 500,
    })
  }

  let client
  try {
    client = getClient()
  } catch (err) {
    return new Response(
      `Failed to initialize agent: ${err.message || err}`,
      { status: 500 },
    )
  }

  const userMessage = `Here is a new public media station intake:
- Station Name: ${stationName}
- Station Location: ${stationLocation}
- Station Website: ${stationWebsite}
Please follow your instructions to produce the funding outlook brief.`

  // Best-effort abort of an upstream event stream. Never throws: this is
  // only used on cleanup paths where a failure must not mask the real
  // outcome.
  const abortUpstream = (stream) => {
    try {
      stream?.controller?.abort?.()
    } catch {
      /* ignore */
    }
  }

  // ----- Create the session -----
  let session
  try {
    const sessionParams = {
      agent: AGENT_ID,
      environment_id: ENVIRONMENT_ID,
    }
    if (VAULT_IDS.length) sessionParams.vault_ids = VAULT_IDS
    session = await client.beta.sessions.create(sessionParams)
  } catch (err) {
    return new Response(
      `Failed to create session: ${err.message || err}`,
      { status: 500 },
    )
  }

  // ----- Open the event stream BEFORE sending the user message -----
  // (Stream-first ensures we don't miss any events the agent emits.)
  let initialUpstream
  try {
    initialUpstream = await client.beta.sessions.events.stream(session.id)
  } catch (err) {
    return new Response(
      `Failed to open session event stream: ${err.message || err}`,
      { status: 500 },
    )
  }

  // ----- Send the kickoff user.message event -----
  try {
    await client.beta.sessions.events.send(session.id, {
      events: [
        {
          type: 'user.message',
          content: [{ type: 'text', text: userMessage }],
        },
      ],
    })
  } catch (err) {
    abortUpstream(initialUpstream)
    return new Response(
      `Failed to send user message: ${err.message || err}`,
      { status: 500 },
    )
  }

  const encoder = new TextEncoder()
  const seenEventIds = new Set()
  const deadline = Date.now() + RECONNECT_BUDGET_MS

  // Shared between start() and cancel(): the upstream stream that is
  // open right now, and whether the browser has gone away. Keeping these
  // at handler scope lets cancel() abort the stream opened by a
  // reconnect, not just the very first one.
  let currentStream = initialUpstream
  let cancelled = false

  const body = new ReadableStream({
    async start(controller) {
      let lastSendAt = Date.now()

      // Writes one NDJSON line. Enqueue errors (stream already closed or
      // cancelled) are ignored on purpose; the loop below stops via the
      // `cancelled` flag instead.
      const writeJson = (obj) => {
        try {
          controller.enqueue(encoder.encode(JSON.stringify(obj) + '\n'))
          lastSendAt = Date.now()
        } catch {
          /* controller closed */
        }
      }

      // Heartbeat to keep the downstream connection alive while we're
      // waiting on the next upstream event (e.g. during a long MCP tool
      // call or before the next model_request_start).
      const heartbeat = setInterval(() => {
        if (Date.now() - lastSendAt >= HEARTBEAT_MS) {
          writeJson({ type: 'heartbeat' })
        }
      }, HEARTBEAT_MS / 2)

      // Translates one upstream session event into zero or more
      // downstream NDJSON lines. Unknown event types are ignored.
      const handle = (event) => {
        switch (event.type) {
          case 'agent.message': {
            if (Array.isArray(event.content)) {
              for (const block of event.content) {
                if (block?.type === 'text' && block.text) {
                  writeJson({ type: 'text', text: block.text + '\n\n' })
                }
              }
            }
            break
          }
          case 'agent.thinking': {
            writeJson({ type: 'status', kind: 'thinking' })
            break
          }
          case 'agent.tool_use':
          case 'agent.mcp_tool_use': {
            writeJson({
              type: 'status',
              kind: 'tool_use',
              name: event.name || 'tool',
              label: describeToolUse(event),
            })
            break
          }
          case 'agent.tool_result':
          case 'agent.mcp_tool_result': {
            if (event.is_error) {
              const msg =
                (Array.isArray(event.content) &&
                  event.content[0]?.text) ||
                'tool error'
              writeJson({
                type: 'status',
                kind: 'tool_result',
                ok: false,
                message: String(msg).slice(0, 300),
              })
            } else {
              writeJson({ type: 'status', kind: 'tool_result', ok: true })
            }
            break
          }
          case 'session.error': {
            const msg = event.error?.message || 'unknown session error'
            writeJson({
              type: 'status',
              kind: 'session_error',
              message: msg,
            })
            break
          }
        }
      }

      let done = false
      let iteration = 0
      try {
        while (!done && !cancelled) {
          iteration++

          // On reconnect (every iteration after the first), reopen the
          // live stream and backfill anything we missed during the gap.
          if (iteration > 1) {
            if (Date.now() >= deadline) {
              writeJson({
                type: 'error',
                message: `Stream reconnect budget (${Math.round(
                  RECONNECT_BUDGET_MS / 60_000,
                )} min) exhausted. The session may still be running — try again or check the Console.`,
              })
              break
            }

            await new Promise((r) => setTimeout(r, RECONNECT_BACKOFF_MS))
            if (cancelled) break

            // Release the dropped stream before replacing it so its
            // connection cannot linger if it errored mid-read.
            abortUpstream(currentStream)
            try {
              currentStream = await client.beta.sessions.events.stream(
                session.id,
              )
            } catch (err) {
              writeJson({
                type: 'error',
                message: `Failed to reopen event stream: ${err?.message || err}`,
              })
              break
            }
            // The browser may have left while we were reopening; the
            // `finally` below aborts the stream we just opened.
            if (cancelled) break

            // Backfill: pull everything the session has emitted so far
            // and dedupe by event.id. Auto-paginated by the SDK.
            try {
              for await (const event of client.beta.sessions.events.list(
                session.id,
                { order: 'asc' },
              )) {
                if (cancelled) break
                if (event.id && !seenEventIds.has(event.id)) {
                  seenEventIds.add(event.id)
                  handle(event)
                }
                // Terminal checks must run even for already-seen events,
                // or a terminal event that came in via the backfill gets
                // skipped and the loop never exits.
                if (isTerminal(event)) {
                  done = true
                  break
                }
              }
            } catch (err) {
              writeJson({
                type: 'status',
                kind: 'session_error',
                message: `Backfill failed: ${err?.message || err}`,
              })
              // Don't break — fall through and try the live stream.
            }
            if (done || cancelled) break
          }

          // Tail the currently-open live stream until it ends, errors,
          // or hits a terminal session event.
          try {
            for await (const event of currentStream) {
              if (event.id && !seenEventIds.has(event.id)) {
                seenEventIds.add(event.id)
                handle(event)
              }
              if (isTerminal(event)) {
                done = true
                break
              }
            }
          } catch {
            // Treat as transient. If it's actually persistent, the
            // reopen + backfill on the next iteration will surface it
            // (or burn down the reconnect budget cleanly). An abort
            // triggered by cancel() also lands here; the loop condition
            // then ends the loop.
          }
        }

        if (done) {
          writeJson({ type: 'done' })
        }
      } catch (err) {
        writeJson({
          type: 'error',
          message: err?.message || String(err),
        })
      } finally {
        clearInterval(heartbeat)
        abortUpstream(currentStream)
        try {
          controller.close()
        } catch {
          /* ignore: already closed or cancelled */
        }
      }
    },

    cancel() {
      // Browser disconnected. Stop the reconnect loop and abort whichever
      // upstream stream is open right now (the initial one or a later
      // reconnect), so nothing keeps running for a reader that is gone.
      cancelled = true
      abortUpstream(currentStream)
    },
  })

  return new Response(body, {
    status: 200,
    headers: {
      'Content-Type': 'application/x-ndjson; charset=utf-8',
      'Cache-Control': 'no-cache, no-transform',
      'X-Accel-Buffering': 'no',
    },
  })
}
// In-source function configuration. `path` is the route this handler is
// registered on.
// NOTE(review): presumably Netlify's build tooling reads this export
// statically, so keep it a plain object literal with literal values —
// confirm before refactoring. Also confirm a custom `path` under the
// `/.netlify` prefix is permitted by the platform.
export const config = {
path: '/.netlify/functions/agent-proxy',
}