> ## Documentation Index
> Fetch the complete documentation index at: https://docs.hasdata.com/llms.txt
> Use this file to discover all available pages before exploring further.

# Quickstart - Websites Crawler

The **Websites Crawler** lets you crawl and extract content from multiple pages of a website by following internal links.

Submit one or more starting URLs and use `maxDepth` to define how deep the crawler should follow links from them. To restrict the crawl to specific paths, pass a regular expression in `includePaths`.

Crawler jobs are asynchronous. The create request returns a `jobId`; you can then fetch the results by polling the job status endpoint or have them delivered to a webhook.

## Example Request

<CodeGroup>
  ```bash cURL theme={null}
  curl --request POST \
    --url 'https://api.hasdata.com/scrapers/crawler/jobs' \
    --header 'Content-Type: application/json' \
    --header 'x-api-key: <your-api-key>' \
    --data '{"urls":["https://example.com"],"maxDepth":3,"includePaths":"(blog/.+|articles/.+)","outputFormat":["text","json"],"webhook":{"url":"https://yourdomain.com/webhook","events":["scraper.job.started","scraper.job.finished","scraper.data.scraped"]}}'
  ```

  ```javascript Node.js theme={null}
  const axios = require('axios').default;

  const options = {
    method: 'POST',
    url: 'https://api.hasdata.com/scrapers/crawler/jobs',
    headers: {'Content-Type': 'application/json', 'x-api-key': '<your-api-key>'},
    data: {
      urls: ['https://example.com'],
      maxDepth: 3,
      includePaths: '(blog/.+|articles/.+)',
      outputFormat: ['text', 'json'],
      webhook: {
        url: 'https://yourdomain.com/webhook',
        events: ['scraper.job.started', 'scraper.job.finished', 'scraper.data.scraped']
      }
    }
  };

  // Top-level `await` is not allowed in CommonJS scripts (this example uses
  // `require`), so the request runs inside an immediately-invoked async function.
  (async () => {
    try {
      const { data } = await axios.request(options);
      console.log(data);
    } catch (error) {
      console.error(error);
    }
  })();
  ```

  ```python Python theme={null}
  import requests

  url = "https://api.hasdata.com/scrapers/crawler/jobs"

  payload = {
      "urls": ["https://example.com"],
      "maxDepth": 3,
      "includePaths": "(blog/.+|articles/.+)",
      "outputFormat": ["text", "json"],
      "webhook": {
          "url": "https://yourdomain.com/webhook",
          "events": ["scraper.job.started", "scraper.job.finished", "scraper.data.scraped"]
      }
  }
  headers = {
      "Content-Type": "application/json",
      "x-api-key": "<your-api-key>"
  }

  response = requests.post(url, json=payload, headers=headers)

  print(response.json())
  ```

  ```php PHP theme={null}
  <?php

  $payload = [
      "urls" => ["https://example.com"],
      "maxDepth" => 3,
      "includePaths" => "(blog/.+|articles/.+)",
      "outputFormat" => ["text", "json"],
      "webhook" => ["url" => "https://yourdomain.com/webhook", "events" => ["scraper.job.started", "scraper.job.finished", "scraper.data.scraped"]],
  ];

  $curl = curl_init();

  curl_setopt_array($curl, [
    CURLOPT_URL => "https://api.hasdata.com/scrapers/crawler/jobs",
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_CUSTOMREQUEST => "POST",
    CURLOPT_POSTFIELDS => json_encode($payload),
    CURLOPT_HTTPHEADER => [
      "Content-Type: application/json",
      "x-api-key: <your-api-key>",
    ],
  ]);

  $response = curl_exec($curl);

  // curl_exec() returns false when the transfer fails; surface the cURL error
  // instead of silently echoing nothing.
  if ($response === false) {
      echo "cURL error: " . curl_error($curl);
  } else {
      echo $response;
  }

  curl_close($curl);
  ```

  ```java Java theme={null}
  OkHttpClient client = new OkHttpClient();

  MediaType mediaType = MediaType.parse("application/json");
  String json = """
    {
      "urls": [
        "https://example.com"
      ],
      "maxDepth": 3,
      "includePaths": "(blog/.+|articles/.+)",
      "outputFormat": [
        "text",
        "json"
      ],
      "webhook": {
        "url": "https://yourdomain.com/webhook",
        "events": [
          "scraper.job.started",
          "scraper.job.finished",
          "scraper.data.scraped"
        ]
      }
    }
  """;
  RequestBody requestBody = RequestBody.create(json, mediaType);

  Request request = new Request.Builder()
    .url("https://api.hasdata.com/scrapers/crawler/jobs")
    .post(requestBody)
    .addHeader("Content-Type", "application/json")
    .addHeader("x-api-key", "<your-api-key>")
    .build();

  Response response = client.newCall(request).execute();
  ```

  ```csharp C# theme={null}
  using System.Net.Http;
  using System.Text;

  var client = new HttpClient();

  var json = """
  {
    "urls": [
      "https://example.com"
    ],
    "maxDepth": 3,
    "includePaths": "(blog/.+|articles/.+)",
    "outputFormat": [
      "text",
      "json"
    ],
    "webhook": {
      "url": "https://yourdomain.com/webhook",
      "events": [
        "scraper.job.started",
        "scraper.job.finished",
        "scraper.data.scraped"
      ]
    }
  }
  """;
  var content = new StringContent(json, Encoding.UTF8, "application/json");

  var request = new HttpRequestMessage(new HttpMethod("POST"), "https://api.hasdata.com/scrapers/crawler/jobs")
  {
      Content = content,
  };
  request.Headers.Add("x-api-key", "<your-api-key>");

  using var response = await client.SendAsync(request);
  response.EnsureSuccessStatusCode();
  var body = await response.Content.ReadAsStringAsync();
  Console.WriteLine(body);
  ```

  ```ruby Ruby theme={null}
  require 'net/http'
  require 'uri'
  require 'json'

  uri = URI("https://api.hasdata.com/scrapers/crawler/jobs")
  payload = {
    "urls" => ["https://example.com"],
    "maxDepth" => 3,
    "includePaths" => "(blog/.+|articles/.+)",
    "outputFormat" => ["text", "json"],
    "webhook" => {"url" => "https://yourdomain.com/webhook", "events" => ["scraper.job.started", "scraper.job.finished", "scraper.data.scraped"]},
  }

  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = true

  request = Net::HTTP::Post.new(uri)
  request["Content-Type"] = 'application/json'
  request["x-api-key"] = '<your-api-key>'
  request.body = payload.to_json

  response = http.request(request)
  puts response.read_body
  ```

  ```rust Rust theme={null}
  use reqwest::blocking::Client;
  use serde_json::json;

  fn main() -> Result<(), Box<dyn std::error::Error>> {
      let client = Client::new();
      let payload = json!({
          "urls": [
              "https://example.com"
          ],
          "maxDepth": 3,
          "includePaths": "(blog/.+|articles/.+)",
          "outputFormat": [
              "text",
              "json"
          ],
          "webhook": {
              "url": "https://yourdomain.com/webhook",
              "events": [
                  "scraper.job.started",
                  "scraper.job.finished",
                  "scraper.data.scraped"
              ]
          }
      });
      let res = client
          .post("https://api.hasdata.com/scrapers/crawler/jobs")
          .header("Content-Type", "application/json")
          .header("x-api-key", "<your-api-key>")
          .json(&payload)
          .send()?
          .text()?;
      println!("{}", res);
      Ok(())
  }
  ```

  ```go Go theme={null}
  package main

  import (
  	"bytes"
  	"fmt"
  	"io"
  	"net/http"
  )

  // main submits a crawler job and prints the raw API response.
  func main() {
  	payload := []byte(`{
    "urls": [
      "https://example.com"
    ],
    "maxDepth": 3,
    "includePaths": "(blog/.+|articles/.+)",
    "outputFormat": [
      "text",
      "json"
    ],
    "webhook": {
      "url": "https://yourdomain.com/webhook",
      "events": [
        "scraper.job.started",
        "scraper.job.finished",
        "scraper.data.scraped"
      ]
    }
  }`)

  	req, err := http.NewRequest("POST", "https://api.hasdata.com/scrapers/crawler/jobs", bytes.NewBuffer(payload))
  	if err != nil {
  		fmt.Println("failed to build request:", err)
  		return
  	}
  	req.Header.Add("Content-Type", "application/json")
  	req.Header.Add("x-api-key", "<your-api-key>")

  	// On error the response may be nil, so check err before touching res.Body.
  	res, err := http.DefaultClient.Do(req)
  	if err != nil {
  		fmt.Println("request failed:", err)
  		return
  	}
  	defer res.Body.Close()

  	responseBody, err := io.ReadAll(res.Body)
  	if err != nil {
  		fmt.Println("failed to read response:", err)
  		return
  	}
  	fmt.Println(string(responseBody))
  }
  ```
</CodeGroup>

## Use Web Scraping API Params

You can use **any parameters from the Web Scraping API** inside a Websites Crawler job — including:

* `extractRules`
* `aiExtractRules`
* `headers`
* `proxyType` / `proxyCountry`
* `blockResources`, `jsScenario`, `outputFormat`, and [more](/apis/web-scraping-api/api-params)

All parameters are applied to each crawled page individually.

## Get Scraper Job Status

To get the status of an existing scraper job, make a GET request to the endpoint `/scrapers/jobs/:jobId`, replacing `:jobId` with the `jobId` you received when the job was created:

<CodeGroup>
  ```bash cURL theme={null}
  curl --request GET \
    --url 'https://api.hasdata.com/scrapers/jobs/:jobId' \
    --header 'Content-Type: application/json' \
    --header 'x-api-key: <your-api-key>'
  ```

  ```javascript Node.js theme={null}
  const axios = require('axios').default;

  const options = {
    method: 'GET',
    url: 'https://api.hasdata.com/scrapers/jobs/:jobId',
    headers: {'Content-Type': 'application/json', 'x-api-key': '<your-api-key>'}
  };

  // Top-level `await` is not allowed in CommonJS scripts (this example uses
  // `require`), so the request runs inside an immediately-invoked async function.
  (async () => {
    try {
      const { data } = await axios.request(options);
      console.log(data);
    } catch (error) {
      console.error(error);
    }
  })();
  ```

  ```python Python theme={null}
  import requests

  url = "https://api.hasdata.com/scrapers/jobs/:jobId"

  headers = {
      "Content-Type": "application/json",
      "x-api-key": "<your-api-key>"
  }

  response = requests.get(url, headers=headers)

  print(response.json())
  ```

  ```php PHP theme={null}
  <?php

  $curl = curl_init();

  curl_setopt_array($curl, [
    CURLOPT_URL => "https://api.hasdata.com/scrapers/jobs/:jobId",
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_CUSTOMREQUEST => "GET",
    CURLOPT_HTTPHEADER => [
      "Content-Type: application/json",
      "x-api-key: <your-api-key>",
    ],
  ]);

  $response = curl_exec($curl);

  // curl_exec() returns false when the transfer fails; surface the cURL error
  // instead of silently echoing nothing.
  if ($response === false) {
      echo "cURL error: " . curl_error($curl);
  } else {
      echo $response;
  }

  curl_close($curl);
  ```

  ```java Java theme={null}
  OkHttpClient client = new OkHttpClient();

  Request request = new Request.Builder()
    .url("https://api.hasdata.com/scrapers/jobs/:jobId")
    .get()
    .addHeader("Content-Type", "application/json")
    .addHeader("x-api-key", "<your-api-key>")
    .build();

  Response response = client.newCall(request).execute();
  ```

  ```csharp C# theme={null}
  using System.Net.Http;

  var client = new HttpClient();

  var request = new HttpRequestMessage(new HttpMethod("GET"), "https://api.hasdata.com/scrapers/jobs/:jobId");
  request.Headers.Add("x-api-key", "<your-api-key>");

  using var response = await client.SendAsync(request);
  response.EnsureSuccessStatusCode();
  var content = await response.Content.ReadAsStringAsync();
  Console.WriteLine(content);
  ```

  ```ruby Ruby theme={null}
  require 'net/http'
  require 'uri'

  uri = URI("https://api.hasdata.com/scrapers/jobs/:jobId")

  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = true

  request = Net::HTTP::Get.new(uri)
  request["Content-Type"] = 'application/json'
  request["x-api-key"] = '<your-api-key>'

  response = http.request(request)
  puts response.read_body
  ```

  ```rust Rust theme={null}
  use reqwest::blocking::Client;

  fn main() -> Result<(), Box<dyn std::error::Error>> {
      let client = Client::new();
      let res = client
          .get("https://api.hasdata.com/scrapers/jobs/:jobId")
          .header("Content-Type", "application/json")
          .header("x-api-key", "<your-api-key>")
          .send()?
          .text()?;
      println!("{}", res);
      Ok(())
  }
  ```

  ```go Go theme={null}
  package main

  import (
  	"fmt"
  	"io"
  	"net/http"
  )

  // main fetches the status of a scraper job and prints the raw API response.
  func main() {
  	req, err := http.NewRequest("GET", "https://api.hasdata.com/scrapers/jobs/:jobId", nil)
  	if err != nil {
  		fmt.Println("failed to build request:", err)
  		return
  	}
  	req.Header.Add("Content-Type", "application/json")
  	req.Header.Add("x-api-key", "<your-api-key>")

  	// On error the response may be nil, so check err before touching res.Body.
  	res, err := http.DefaultClient.Do(req)
  	if err != nil {
  		fmt.Println("request failed:", err)
  		return
  	}
  	defer res.Body.Close()

  	body, err := io.ReadAll(res.Body)
  	if err != nil {
  		fmt.Println("failed to read response:", err)
  		return
  	}
  	fmt.Println(string(body))
  }
  ```
</CodeGroup>

<Accordion title="Response">
  ```json theme={null}
  {
    "id": "dd1a8c53-2d47-4444-977d-8d653a6a3c82",
    "status": "finished",
    "creditsSpent": 200,
    "dataRowsCount": 20,
    "data": {
      "csv": "https://api.hasdata.com/scrapers/jobs/dd1a8c53-2d47-4444-977d-8d653a6a3c82/results/b6cc6733-6d0e-4e44-9e94-38688aad3884.csv",
      "json": "https://api.hasdata.com/scrapers/jobs/dd1a8c53-2d47-4444-977d-8d653a6a3c82/results/9cb592e3-6700-42ff-b58c-e7da3f478f28.json",
      "xlsx": "https://api.hasdata.com/scrapers/jobs/dd1a8c53-2d47-4444-977d-8d653a6a3c82/results/ecea853c-e0ca-4a23-ae74-eea0588e54b6.xlsx"
    },
    "input": {
      "limit": 25,
      "urls": ["https://hasdata.com", "https://example.com"],
      "maxDepth": 5,
      "includePaths": "(blog/.+|articles/.+)",
      "webhook": {
        "url": "https://example.com/webhook",
        "events": ["scraper.job.started", "scraper.job.finished", "scraper.data.scraped"]
      }
    }
  }
  ```
</Accordion>

## Webhook

The webhook notifies you of the scraper job events you listed in `webhook.events` when creating the job. Here is an example webhook payload for the `scraper.data.scraped` event:

```json theme={null}
{
  "event": "scraper.data.scraped",
  "timestamp": "2025-04-11T14:30:00Z",
  "jobId": "dd1a8c53-2d47-4444-977d-8d653a6a3c82",
  "jobStatus": "in_progress",
  "data": [
    {
      "text": "Extracted text here...",
      "statusCode": 200,
      "statusText": "OK",
      "url": "https://hasdata.com/blog",
      "depth": 1,
      "title": "Blog | HasData"
    }
  ]
}
```
