From f12eccc8674b2645628e438336664e979d4094d0 Mon Sep 17 00:00:00 2001 From: FrancescoSaverioZuppichini Date: Thu, 28 May 2026 11:08:15 +0200 Subject: [PATCH 1/4] Add Vercel AI SDK integration docs --- docs.json | 1 + integrations/vercel_ai.mdx | 262 +++++++++++++++++++++++++++---------- logo/vercel.svg | 9 ++ 3 files changed, 202 insertions(+), 70 deletions(-) create mode 100644 logo/vercel.svg diff --git a/docs.json b/docs.json index 2e312bf..1482bb7 100644 --- a/docs.json +++ b/docs.json @@ -96,6 +96,7 @@ { "group": "Frameworks", "pages": [ + "integrations/vercel_ai", "integrations/langchain", "integrations/langgraph", "integrations/llamaindex", diff --git a/integrations/vercel_ai.mdx b/integrations/vercel_ai.mdx index 9a4e3f9..196a040 100644 --- a/integrations/vercel_ai.mdx +++ b/integrations/vercel_ai.mdx @@ -1,108 +1,230 @@ --- -title: "⚡ Vercel AI" -description: "Integrate ScrapeGraphAI into Vercel AI" +title: "Vercel AI SDK" +description: "Use ScrapeGraphAI as first-party tools inside Vercel AI SDK agents" +icon: "/logo/vercel.svg" --- ## Overview -[Vercel AI SDK](https://ai-sdk.dev/) is a popular JavaScript/TypeScript framework to interact with various LLM providers. This page shows how to integrate it with ScrapeGraph. +`@scrapegraph-ai/ai-sdk` exposes ScrapeGraphAI endpoints as Vercel AI SDK tools. Add the tools to `generateText` or `streamText`, set `stopWhen`, and the model can scrape, extract, search, crawl, and monitor web data during the run. 
- - View the Vercel AI SDK documentation - + + + Official Vercel AI SDK documentation + + + How AI SDK Core tools are executed + + ## Installation -Follow our [JavaScript SDK installation steps](/sdks/javascript) using your favourite package manager: +Install the ScrapeGraphAI tool package, the AI SDK, and the model provider you use. The commands below are alternatives; run only the one for your package manager: ```bash -# Using npm -npm i scrapegraph-js +npm i @scrapegraph-ai/ai-sdk ai @ai-sdk/openai +pnpm add @scrapegraph-ai/ai-sdk ai @ai-sdk/openai +yarn add @scrapegraph-ai/ai-sdk ai @ai-sdk/openai +bun add @scrapegraph-ai/ai-sdk ai @ai-sdk/openai +``` -# Using pnpm -pnpm i scrapegraph-js +Set your keys: -# Using yarn -yarn add scrapegraph-js +```bash +export SGAI_API_KEY="your-scrapegraph-key" +export OPENAI_API_KEY="your-openai-key" +``` + + +The tools read `SGAI_API_KEY` from the environment by default. You can also pass `{ apiKey: process.env.SGAI_API_KEY }` to any tool factory. + + +## Quickstart -# Using bun -bun add scrapegraph-js +Give the model a scrape tool and allow multiple steps so it can call the tool, receive the result, then write the final answer. 
+ +```ts +import { openai } from "@ai-sdk/openai"; +import { generateText, stepCountIs } from "ai"; +import { scrapeTool } from "@scrapegraph-ai/ai-sdk"; + +const { text } = await generateText({ + model: openai("gpt-5-nano"), + prompt: + "Scrape Hacker News and write a short, concise summary of what people are talking about today.", + tools: { + scrape: scrapeTool(), + }, + stopWhen: stepCountIs(3), +}); + +console.log(text); ``` -Then, install [Vercel AI](https://ai-sdk.dev/docs/getting-started) with their [OpenAI provider](https://ai-sdk.dev/providers/ai-sdk-providers/openai): +## Available tools -```bash -# Using npm -npm i ai @ai-sdk/openai +| Factory | What it gives the model | +|---|---| +| `scrapeTool()` | Scrape a page as markdown, HTML, JSON, links, images, summary, branding, or screenshot | +| `extractTool()` | Extract structured JSON from a URL, HTML, or markdown with a prompt | +| `searchTool()` | Search the web and optionally extract structured data from results | +| `crawlTools()` | Start, poll, page through, stop, resume, and delete crawl jobs | +| `monitorTools()` | Create, list, update, pause, resume, delete, and inspect monitor activity | -# Using pnpm -pnpm i ai @ai-sdk/openai +Use a narrow tool set when the task is specific. 
Use all tools when the agent needs to decide the workflow: -# Using yarn -yarn add ai @ai-sdk/openai +```ts +import { openai } from "@ai-sdk/openai"; +import { generateText, stepCountIs } from "ai"; +import { + crawlTools, + extractTool, + monitorTools, + scrapeTool, + searchTool, +} from "@scrapegraph-ai/ai-sdk"; + +const { text } = await generateText({ + model: openai("gpt-5-nano"), + prompt: "Search for ScrapeGraphAI docs, scrape the best page, and summarize it.", + tools: { + scrape: scrapeTool(), + extract: extractTool(), + search: searchTool(), + ...crawlTools(), + ...monitorTools(), + }, + stopWhen: stepCountIs(10), +}); -# Using bun -bun add ai @ai-sdk/openai +console.log(text); ``` -## Usage +## Scrape example -The ScrapeGraph SDK can be used like any other tool. See [Vercel AI tool calling docs](https://ai-sdk.dev/docs/ai-sdk-core/tools-and-tool-calling). +This is the smallest useful agent: one scrape tool, a concrete target, and enough steps for the model to call the tool before answering. 
```ts -import { z } from "zod"; -import { generateText, tool } from "ai"; import { openai } from "@ai-sdk/openai"; -import { extract } from "scrapegraph-js"; +import { generateText, stepCountIs } from "ai"; +import { scrapeTool } from "@scrapegraph-ai/ai-sdk"; const result = await generateText({ - model: openai("gpt-4.1-mini"), + model: openai("gpt-5-nano"), + prompt: "Find the main headline on https://example.com", tools: { - scrape: tool({ - description: "Extract articles information from a given URL.", - parameters: z.object({ - url: z.string().describe("The exact URL."), - }), - execute: async ({ url }) => { - const response = await extract(process.env.SGAI_API_KEY!, { - url, - prompt: "Extract the article information", - schema: { - type: "object", - properties: { - articles: { - type: "array", - items: { - type: "object", - properties: { - title: { type: "string" }, - author: { type: "string" }, - publishDate: { type: "string" }, - content: { type: "string" }, - category: { type: "string" }, - }, - }, - }, - }, - }, - }); - return response.data?.json; - }, - }), + scrape: scrapeTool(), }, - prompt: "Can you find me the articles on https://scrapegraphai.com/blog?", + stopWhen: stepCountIs(5), }); -console.log(result); +console.log(result.text); ``` -## Support +Pass the API key explicitly with the `apiKey` option when you would rather not rely on the implicit `SGAI_API_KEY` lookup; the value can come from any source your runtime provides: -Need help with the integration? +```ts +const tools = { + scrape: scrapeTool({ apiKey: process.env.SGAI_API_KEY }), +}; +``` + +## Crawl example + +`crawlTools()` gives the model the full async crawl loop: start the job, poll status with `getCrawl`, then retrieve paginated pages with `getCrawlPages`. + +```ts +import { openai } from "@ai-sdk/openai"; +import { generateText, stepCountIs } from "ai"; +import { crawlTools } from "@scrapegraph-ai/ai-sdk"; + +const { text, steps } = await generateText({ + model: openai("gpt-5-nano"), + prompt: + "Find 10 https://scrapegraphai.com/ blog posts. 
Start a crawl, poll its status, fetch crawled pages with getCrawlPages, then summarize what you found.", + tools: { + ...crawlTools(), + }, + stopWhen: stepCountIs(20), +}); + +for (const step of steps) { + for (const toolCall of step.toolCalls) { + console.log(`[tool] ${toolCall.toolName}`); + console.log(JSON.stringify(toolCall.input, null, 2)); + } +} + +console.log(text); +``` + +For longer crawls, keep the same tools but add your app's own timeout, cancellation, and persistence around the AI SDK call. + +## Tool reference + +### Scrape + +```ts +import { scrapeTool } from "@scrapegraph-ai/ai-sdk"; + +const tools = { + scrape: scrapeTool(), +}; +``` + +### Extract + +```ts +import { extractTool } from "@scrapegraph-ai/ai-sdk"; + +const tools = { + extract: extractTool(), +}; +``` + +### Search + +```ts +import { searchTool } from "@scrapegraph-ai/ai-sdk"; + +const tools = { + search: searchTool(), +}; +``` + +### Crawl + +```ts +import { crawlTools } from "@scrapegraph-ai/ai-sdk"; + +const tools = { + ...crawlTools(), +}; +``` + +`crawlTools()` registers `startCrawl`, `getCrawl`, `getCrawlPages`, `stopCrawl`, `resumeCrawl`, and `deleteCrawl`. + +### Monitor + +```ts +import { monitorTools } from "@scrapegraph-ai/ai-sdk"; + +const tools = { + ...monitorTools(), +}; +``` + +`monitorTools()` registers `createMonitor`, `listMonitors`, `getMonitor`, `updateMonitor`, `deleteMonitor`, `pauseMonitor`, `resumeMonitor`, and `getMonitorActivity`. 
+ +## Support + + + + + + + + From cc0caef87578ee06e15e65fe75d316ea3f6b1ce9 Mon Sep 17 00:00:00 2001 From: FrancescoSaverioZuppichini Date: Thu, 28 May 2026 11:09:35 +0200 Subject: [PATCH 2/4] Add explicit Vercel AI SDK docs link --- integrations/vercel_ai.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integrations/vercel_ai.mdx b/integrations/vercel_ai.mdx index 196a040..294f78e 100644 --- a/integrations/vercel_ai.mdx +++ b/integrations/vercel_ai.mdx @@ -6,11 +6,11 @@ icon: "/logo/vercel.svg" ## Overview -`@scrapegraph-ai/ai-sdk` exposes ScrapeGraphAI endpoints as Vercel AI SDK tools. Add the tools to `generateText` or `streamText`, set `stopWhen`, and the model can scrape, extract, search, crawl, and monitor web data during the run. +`@scrapegraph-ai/ai-sdk` exposes ScrapeGraphAI endpoints as [Vercel AI SDK](https://ai-sdk.dev/docs/introduction) tools. Add the tools to `generateText` or `streamText`, set `stopWhen`, and the model can scrape, extract, search, crawl, and monitor web data during the run. From 3e36159301fe0883ee1a5d63a1ab1f357b49f22f Mon Sep 17 00:00:00 2001 From: FrancescoSaverioZuppichini Date: Thu, 28 May 2026 11:12:23 +0200 Subject: [PATCH 3/4] Add missing crawl pages API reference --- api-reference/endpoint/crawl/pages.mdx | 103 +++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 api-reference/endpoint/crawl/pages.mdx diff --git a/api-reference/endpoint/crawl/pages.mdx b/api-reference/endpoint/crawl/pages.mdx new file mode 100644 index 0000000..eacecd4 --- /dev/null +++ b/api-reference/endpoint/crawl/pages.mdx @@ -0,0 +1,103 @@ +--- +title: 'Get crawl pages' +description: 'Fetch paginated crawl pages with resolved scrape results.' +--- + +```http +GET https://v2-api.scrapegraphai.com/api/crawl/:id/pages +``` + +Returns a cursor-paginated slice of crawl pages for a job started with [`POST /api/crawl`](/api-reference/endpoint/crawl/start). 
Each returned page includes its lightweight crawl metadata and, when available, the resolved `scrape` result for that page. + +Use this endpoint for page content. Keep [`GET /api/crawl/:id`](/api-reference/endpoint/crawl/get-status) for lightweight status polling. + +## Path parameters + + + The crawl job UUID returned by `POST /api/crawl`. + + +## Query parameters + + + Number of crawl pages to return in this response. Minimum `1`, maximum `100`. + + + + Zero-based index cursor. `0` starts at the first crawl page. Use the `pagination.nextCursor` value from the previous response to fetch the next slice. + + +### Pagination behavior + +`limit` controls the page size. If you omit it, the API returns up to `50` crawl pages. `cursor` is an index into the ordered crawl page list, not an opaque token. For example: + +```bash +# First 50 crawl pages +curl -X GET "https://v2-api.scrapegraphai.com/api/crawl/:id/pages?limit=50&cursor=0" \ + -H "SGAI-APIKEY: $SGAI_API_KEY" + +# If the response returns "nextCursor": "50", fetch the next 50 +curl -X GET "https://v2-api.scrapegraphai.com/api/crawl/:id/pages?limit=50&cursor=50" \ + -H "SGAI-APIKEY: $SGAI_API_KEY" +``` + +When `pagination.nextCursor` is `null`, there are no more crawl pages to fetch. 
+ +## Example request + +```bash +curl -X GET "https://v2-api.scrapegraphai.com/api/crawl/79694e03-f2ea-43f2-93cc-7c6fc26f999a/pages?limit=50&cursor=0" \ + -H "SGAI-APIKEY: $SGAI_API_KEY" +``` + +## Example response + +```json +{ + "data": [ + { + "url": "https://example.com", + "depth": 0, + "title": "", + "status": "completed", + "parentUrl": null, + "contentType": "text/html", + "links": ["https://iana.org/domains/example"], + "scrapeRefId": "83a911ed-c0bc-4a8c-ad62-8efeeb93f33a", + "scrape": { + "results": { + "markdown": { + "data": ["# Example Domain\n\nThis domain is for use in illustrative examples..."] + } + }, + "metadata": { + "contentType": "text/html" + } + } + } + ], + "pagination": { + "limit": 50, + "nextCursor": null + } +} +``` + +| Field | Description | +|-------|-------------| +| `data[]` | Ordered crawl pages for this slice. | +| `data[].scrapeRefId` | UUID of the underlying Scrape request. | +| `data[].scrape` | Resolved Scrape response for the page, when the page has a `scrapeRefId` and the result is available. | +| `pagination.limit` | Echo of the requested page size. | +| `pagination.nextCursor` | Cursor for the next request, or `null` when there are no more pages. | + + +`scrape` is resolved by default. There is no `expand` or `populate` query parameter. If you only need one page's underlying Scrape request, you can also fetch `data[].scrapeRefId` with [`GET /api/history/:id`](/api-reference/endpoint/history). 
+ + +## Related + +- Start a job: [`POST /api/crawl`](/api-reference/endpoint/crawl/start) +- Poll status: [`GET /api/crawl/:id`](/api-reference/endpoint/crawl/get-status) +- Fetch one underlying scrape: [`GET /api/history/:id`](/api-reference/endpoint/history) +- Stop / resume / delete: [Manage crawl jobs](/api-reference/endpoint/crawl/manage) From 4c5cfe752f14a21ab91e40f977ec2dabd63ff37b Mon Sep 17 00:00:00 2001 From: FrancescoSaverioZuppichini Date: Thu, 28 May 2026 11:14:26 +0200 Subject: [PATCH 4/4] Revert "Add missing crawl pages API reference" This reverts commit 3e36159301fe0883ee1a5d63a1ab1f357b49f22f. --- api-reference/endpoint/crawl/pages.mdx | 103 ------------------------- 1 file changed, 103 deletions(-) delete mode 100644 api-reference/endpoint/crawl/pages.mdx diff --git a/api-reference/endpoint/crawl/pages.mdx b/api-reference/endpoint/crawl/pages.mdx deleted file mode 100644 index eacecd4..0000000 --- a/api-reference/endpoint/crawl/pages.mdx +++ /dev/null @@ -1,103 +0,0 @@ ---- -title: 'Get crawl pages' -description: 'Fetch paginated crawl pages with resolved scrape results.' ---- - -```http -GET https://v2-api.scrapegraphai.com/api/crawl/:id/pages -``` - -Returns a cursor-paginated slice of crawl pages for a job started with [`POST /api/crawl`](/api-reference/endpoint/crawl/start). Each returned page includes its lightweight crawl metadata and, when available, the resolved `scrape` result for that page. - -Use this endpoint for page content. Keep [`GET /api/crawl/:id`](/api-reference/endpoint/crawl/get-status) for lightweight status polling. - -## Path parameters - - - The crawl job UUID returned by `POST /api/crawl`. - - -## Query parameters - - - Number of crawl pages to return in this response. Minimum `1`, maximum `100`. - - - - Zero-based index cursor. `0` starts at the first crawl page. Use the `pagination.nextCursor` value from the previous response to fetch the next slice. - - -### Pagination behavior - -`limit` controls the page size. 
If you omit it, the API returns up to `50` crawl pages. `cursor` is an index into the ordered crawl page list, not an opaque token. For example: - -```bash -# First 50 crawl pages -curl -X GET "https://v2-api.scrapegraphai.com/api/crawl/:id/pages?limit=50&cursor=0" \ - -H "SGAI-APIKEY: $SGAI_API_KEY" - -# If the response returns "nextCursor": "50", fetch the next 50 -curl -X GET "https://v2-api.scrapegraphai.com/api/crawl/:id/pages?limit=50&cursor=50" \ - -H "SGAI-APIKEY: $SGAI_API_KEY" -``` - -When `pagination.nextCursor` is `null`, there are no more crawl pages to fetch. - -## Example request - -```bash -curl -X GET "https://v2-api.scrapegraphai.com/api/crawl/79694e03-f2ea-43f2-93cc-7c6fc26f999a/pages?limit=50&cursor=0" \ - -H "SGAI-APIKEY: $SGAI_API_KEY" -``` - -## Example response - -```json -{ - "data": [ - { - "url": "https://example.com", - "depth": 0, - "title": "", - "status": "completed", - "parentUrl": null, - "contentType": "text/html", - "links": ["https://iana.org/domains/example"], - "scrapeRefId": "83a911ed-c0bc-4a8c-ad62-8efeeb93f33a", - "scrape": { - "results": { - "markdown": { - "data": ["# Example Domain\n\nThis domain is for use in illustrative examples..."] - } - }, - "metadata": { - "contentType": "text/html" - } - } - } - ], - "pagination": { - "limit": 50, - "nextCursor": null - } -} -``` - -| Field | Description | -|-------|-------------| -| `data[]` | Ordered crawl pages for this slice. | -| `data[].scrapeRefId` | UUID of the underlying Scrape request. | -| `data[].scrape` | Resolved Scrape response for the page, when the page has a `scrapeRefId` and the result is available. | -| `pagination.limit` | Echo of the requested page size. | -| `pagination.nextCursor` | Cursor for the next request, or `null` when there are no more pages. | - - -`scrape` is resolved by default. There is no `expand` or `populate` query parameter. 
If you only need one page's underlying Scrape request, you can also fetch `data[].scrapeRefId` with [`GET /api/history/:id`](/api-reference/endpoint/history). - - -## Related - -- Start a job: [`POST /api/crawl`](/api-reference/endpoint/crawl/start) -- Poll status: [`GET /api/crawl/:id`](/api-reference/endpoint/crawl/get-status) -- Fetch one underlying scrape: [`GET /api/history/:id`](/api-reference/endpoint/history) -- Stop / resume / delete: [Manage crawl jobs](/api-reference/endpoint/crawl/manage)