> ## Documentation Index
> Fetch the complete documentation index at: https://developer.upsun.com/llms.txt
> Use this file to discover all available pages before exploring further.

# The (not so) hidden cost of AI scrapers

> AI scrapers drive up your hosting cost while real users wait. Use robots.txt, cache normalization, and sane application limits to take the pressure off.


export const PostMeta = ({data = {}}) => {
  const {author, date, image} = data;
  const authors = Array.isArray(author) ? author : author ? [author] : [];
  const resolveAuthor = slug => {
    const entry = AUTHOR_MAP[slug] || ({});
    const name = entry.name || slug;
    const github = entry.github || null;
    const linkedin = entry.linkedin || null;
    const url = github ? `https://github.com/${github}` : linkedin || null;
    const avatarUrl = github ? `https://github.com/${github}.png?size=64` : null;
    return {
      name,
      url,
      avatarUrl
    };
  };
  const formattedDate = date ? new Date(date).toLocaleDateString('en-US', {
    year: 'numeric',
    month: 'long',
    day: 'numeric'
  }) : null;
  if (!image && authors.length === 0 && !formattedDate) return null;
  const AUTHOR_MAP = {
    "aaron-collier": {
      "name": "Aaron Collier"
    },
    "aaron-dudenhofer": {
      "name": "Aaron Dudenhofer"
    },
    "aaron-porter": {
      "name": "Aaron Porter"
    },
    "adriaan-odendaal": {
      "name": "Adriaan Odendaal"
    },
    "ajmal": {
      "name": "Ajmal Siddiqui"
    },
    "akalipetis": {
      "name": "Antonis Kalipetis"
    },
    "alexander-varwijk": {
      "name": "Alexander Varwijk"
    },
    "alicia-bevilacqua": {
      "name": "Alicia Bevilacqua"
    },
    "amelie-deguerry": {
      "name": "Amelie Deguerry"
    },
    "anacidre": {
      "name": "Ana Cidre",
      "linkedin": "https://www.linkedin.com/in/ana-cidre"
    },
    "andoni": {
      "name": "Andoni Auzmendi"
    },
    "andrei-taranu": {
      "name": "Andrei (Alex) Taranu",
      "linkedin": "https://www.linkedin.com/in/andrei-alex-taranu/"
    },
    "andrew-baxter": {
      "name": "Andrew Baxter"
    },
    "andrew-melck": {
      "name": "Andrew Melck"
    },
    "antoine-crochet-damais": {
      "name": "Antoine Crochet Damais"
    },
    "augustin-delaporte": {
      "name": "Augustin Delaporte",
      "linkedin": "https://www.linkedin.com/in/augustindelaporte/"
    },
    "branislav-bujisic": {
      "name": "Branislav Bujisic"
    },
    "carl-smith": {
      "name": "Carl Smith"
    },
    "caroline-leroy": {
      "name": "Caroline Leroy"
    },
    "cati-mayer": {
      "name": "Cati Mayer"
    },
    "catplat": {
      "name": "C Trinkwon"
    },
    "ceelolulu": {
      "name": "Celeste van der Watt"
    },
    "chadwcarlson": {
      "name": "Chad Carlson",
      "github": "chadwcarlson",
      "linkedin": "https://www.linkedin.com/in/chadwcarlson"
    },
    "chris-ward": {
      "name": "Chris Ward"
    },
    "chris-yates": {
      "name": "Chris Yates"
    },
    "christian-sieber": {
      "name": "Christian Sieber"
    },
    "christopher-lockheardt": {
      "name": "Christopher Lockheardt"
    },
    "christopher-skene": {
      "name": "Christopher Skene"
    },
    "chuck-morgan": {
      "name": "Chuck Morgan"
    },
    "corey-dockendorf": {
      "name": "Corey Dockendorf"
    },
    "crell": {
      "name": "Crell"
    },
    "damz": {
      "name": "Damz"
    },
    "dan-morrison": {
      "name": "Dan Morrison"
    },
    "davidbonachera": {
      "name": "David Bonachera",
      "github": "davidbonachera",
      "linkedin": "https://www.linkedin.com/in/davidbonachera"
    },
    "dereliahmet1": {
      "name": "Ahmet Faruk Dereli"
    },
    "devicezero": {
      "name": "Jonas Kröger",
      "github": "devicezero",
      "linkedin": "https://www.linkedin.com/in/jonaskroeger/"
    },
    "doug-goldberg": {
      "name": "Doug Goldberg"
    },
    "duncan-naves": {
      "name": "Duncan Naves",
      "github": "duncannaves",
      "linkedin": "https://www.linkedin.com/in/duncan-naves-a94423aa"
    },
    "erika-bustamante": {
      "name": "Erika Bustamante"
    },
    "fabpot": {
      "name": "Fabien Potencier"
    },
    "flovntp": {
      "name": "Florent Huck",
      "github": "flovntp",
      "linkedin": "https://www.linkedin.com/in/florenthuck"
    },
    "fred-plais": {
      "name": "Fred Plais"
    },
    "gauthier-garnier": {
      "name": "Gauthier Garnier"
    },
    "gilzow": {
      "name": "Paul Gilzow"
    },
    "gmoigneu": {
      "name": "Guillaume Moigneu",
      "github": "gmoigneu",
      "linkedin": "https://www.linkedin.com/in/guillaumemoigneu/"
    },
    "gregqualls": {
      "name": "Greg Qualls"
    },
    "guguss": {
      "name": "Augustin Delaporte"
    },
    "haylee-millar": {
      "name": "Haylee Millar"
    },
    "ivana-kotur": {
      "name": "Ivana Kotur"
    },
    "jackrabbithanna": {
      "name": "Mark Hanna"
    },
    "jared-wright": {
      "name": "Jared Wright",
      "github": "jww-sh",
      "linkedin": "https://www.linkedin.com/in/jaredwaynewright"
    },
    "jessica-orozco": {
      "name": "Jessica Orozco"
    },
    "joey-stanford": {
      "name": "Joey Stanford"
    },
    "john-grubb": {
      "name": "John Grubb"
    },
    "jonas-kruger": {
      "name": "Jonas Kruger"
    },
    "kathryn-frazer": {
      "name": "Kathryn Frazer"
    },
    "kemiojo": {
      "name": "Kemi Elizabeth Ojogbede"
    },
    "kieronsambrook-smith": {
      "name": "Kieronsambrook Smith"
    },
    "laurent-arnoud": {
      "name": "Laurent Arnoud"
    },
    "letoya-boyne": {
      "name": "Letoya Boyne"
    },
    "lolautruche": {
      "name": "Jérôme Vieilledent"
    },
    "lyly-lepinay": {
      "name": "Lyly Lepinay"
    },
    "manauwar-alam": {
      "name": "Manauwar Alam"
    },
    "marc-antoine-porri": {
      "name": "Marc Antoine Porri"
    },
    "maria-antinkaapo": {
      "name": "Maria Antinkaapo"
    },
    "maria-de-anton": {
      "name": "Maria De Anton"
    },
    "mark-dorison": {
      "name": "Mark Dorison"
    },
    "markus-hausammann": {
      "name": "Markus Hausammann"
    },
    "mary-thomas": {
      "name": "Mary Thomas"
    },
    "mathias-bolt-lesniak": {
      "name": "Mathias Bolt Lesniak"
    },
    "mathieu-strauch": {
      "name": "Mathieu Strauch"
    },
    "matthias-van-woensel": {
      "name": "Matthias Van Woensel",
      "linkedin": "https://www.linkedin.com/in/matthias-van-woensel-267a069"
    },
    "michael-sharp": {
      "name": "Michael Sharp"
    },
    "mupsi": {
      "name": "Marine Gandy"
    },
    "natalie-harper": {
      "name": "Natalie Harper"
    },
    "ngommenginger": {
      "name": "Nicolas Gommenginger",
      "linkedin": "https://www.linkedin.com/in/nicolas-gommenginger"
    },
    "nicholas-bennison": {
      "name": "Nicholas Bennison"
    },
    "nicholas-vahalik": {
      "name": "Nicholas Vahalik"
    },
    "nick-hardiman": {
      "name": "Nick Hardiman"
    },
    "nickanderegg": {
      "name": "Nickanderegg"
    },
    "nicolas-grekas": {
      "name": "Nicolas Grekas",
      "github": "nicolas-grekas",
      "linkedin": "https://www.linkedin.com/in/nicolasgrekas/"
    },
    "niti-malwade": {
      "name": "Niti Malwade"
    },
    "opensocialteam": {
      "name": "Opensocialteam"
    },
    "ori-pekelman": {
      "name": "Ori Pekelman"
    },
    "otavio-santana": {
      "name": "Otavio Santana"
    },
    "palwandi": {
      "name": "Pawan Alwandi",
      "github": "pawpy",
      "linkedin": "https://www.linkedin.com/in/pawanalwandi"
    },
    "patrick-boest": {
      "name": "Patrick Boest"
    },
    "patrick-dawkins": {
      "name": "Patrick Dawkins",
      "github": "pjcdawkins",
      "linkedin": "https://www.linkedin.com/in/patrickdawkins"
    },
    "patrick-klima": {
      "name": "Patrick Klima"
    },
    "pjcdawkins": {
      "name": "Pjcdawkins"
    },
    "prineet-kaurbhurji": {
      "name": "Prineet Kaurbhurji"
    },
    "quentin-sinig": {
      "name": "Quentin Sinig"
    },
    "ralt": {
      "name": "Florian Margaine",
      "github": "ralt",
      "linkedin": "https://www.linkedin.com/in/florian-margaine-43971136"
    },
    "ramanathanramakrishnamurthy": {
      "name": "Ramanathanramakrishnamurthy"
    },
    "remi-lejeune": {
      "name": "Rémi Lejeune"
    },
    "ribel": {
      "name": "Taras Kruts"
    },
    "robert-douglass": {
      "name": "Robert Douglass"
    },
    "rudy-weber": {
      "name": "Rudy Weber"
    },
    "ryan-hicks": {
      "name": "Ryan Hicks"
    },
    "sabri-helal": {
      "name": "Sabri Helal"
    },
    "savannah-bergeron": {
      "name": "Savannah Bergeron"
    },
    "shannon-vettes": {
      "name": "Shannon Vettes"
    },
    "shawn-ogasawara": {
      "name": "Shawn Ogasawara",
      "linkedin": "https://www.linkedin.com/in/shawn-ogasawara-83a9a0/"
    },
    "shawna-spoor": {
      "name": "Shawna Spoor"
    },
    "shedrack-akintayo": {
      "name": "Shedrack Akintayo"
    },
    "simon-ruggier": {
      "name": "Simon Ruggier"
    },
    "sophie-van-der-kindere": {
      "name": "Sophie Van Der Kindere"
    },
    "stefanos-thampis": {
      "name": "Stefanos Thampis"
    },
    "stephen-weinberg": {
      "name": "Stephen Weinberg"
    },
    "sukhman-virk": {
      "name": "Sukhman Virk"
    },
    "sumaira-nazir": {
      "name": "Sumaira Nazir"
    },
    "sumer": {
      "name": "Sümer Cip"
    },
    "syed-raza": {
      "name": "Syed Raza"
    },
    "tamara-bacchia": {
      "name": "Tamara Bacchia"
    },
    "tara-arnold": {
      "name": "Tara Arnold"
    },
    "theosakamg": {
      "name": "Mickael Gaillard",
      "github": "theosakamg"
    },
    "thomasdiluccio": {
      "name": "Thomas di Luccio"
    },
    "tim-anderson": {
      "name": "Tim Anderson"
    },
    "tom-helmer-hansen": {
      "name": "Tom Helmer Hansen"
    },
    "tylermills": {
      "name": "Tyler Mills"
    },
    "upsun": {
      "name": "Upsun"
    },
    "veronika-tolkachova": {
      "name": "Veronika Tolkachova",
      "linkedin": "https://www.linkedin.com/in/veronika-tolkachova-169167a2"
    },
    "vince-parker": {
      "name": "Vince Parker"
    },
    "vinnie-russo": {
      "name": "Vincenzo Russo"
    },
    "vrobert78": {
      "name": "Vincent Robert",
      "github": "vrobert78",
      "linkedin": "https://www.linkedin.com/in/vincent-robert-498a883"
    },
    "yuriy-babenko": {
      "name": "Yuriy Babenko"
    },
    "yuriy-gerasimov": {
      "name": "Yuriy Gerasimov"
    }
  };
  return <div className="post-meta">
      {(authors.length > 0 || formattedDate) && <div className="post-meta-info">
          {authors.length > 0 && <div className="post-meta-authors">
              {authors.map(slug => {
    const {name, url, avatarUrl} = resolveAuthor(slug);
    const inner = <>
                    {avatarUrl && <img src={avatarUrl} alt={name} className="post-meta-avatar" />}
                    <span className="post-meta-author-name">{name}</span>
                  </>;
    return url ? <a key={slug} href={url} target="_blank" rel="noopener noreferrer" className="post-meta-author">
                    {inner}
                  </a> : <span key={slug} className="post-meta-author">{inner}</span>;
  })}
            </div>}
          {authors.length > 0 && formattedDate && <span className="post-meta-separator" aria-hidden="true">·</span>}
          {formattedDate && <span className="post-meta-date">{formattedDate}</span>}
        </div>}
      {image && <img src={image} alt="" className="post-meta-image" aria-hidden="true" />}
    </div>;
};

<PostMeta data={{ author: ["ralt"], date: "2026-05-05T09:00:00.000Z" }} />

Your hosting bill keeps growing. Your servers feel busier than they should. Real users sometimes wait too long for a page. Then you check the access logs and realize the traffic isn't who you thought it was.

Most of it is bots. Specifically, AI scrapers and aggressive crawlers, walking your application end-to-end, generating database queries, blowing through your cache, and quietly inflating your infrastructure cost.

You're not alone. Customers run into this regularly, and the problem is rarely visible until the bill arrives.

## Bots will follow every link, no matter how deep

One pattern keeps showing up: pages with no natural endpoint. A calendar that links back month by month is the textbook example. A bot finds the "previous month" link and follows it. The next page has another "previous month" link, and the bot follows that one too. And on, and on. One customer's calendar generated those links all the way back to the 1700s, and the bots dutifully followed.

The shape is what matters here, not the calendar. Any "previous/next" navigation without a lower bound creates the same trap: archive paginations, year/month/day drill-downs, infinite tag pages.

This isn't malicious. The bot is well-behaved. The site never told it to stop.

## Faceted listings turn into millions of URLs

The other pattern is just as expensive. Picture a product listing with facets: brand, screen size, memory, color, price. Every facet adds a query parameter. A crawler that tries every link on the page eventually tries every permutation of every facet combination.

A dozen facets with a few values each means millions of URLs. All of them hit your application server. All of them render full product listings against your database. None of them get cached, because each URL looks unique.

Each individual request is fine. The aggregate is what hurts your infrastructure budget.

## Where the cost lands

Bot traffic uses the same application servers, the same database, and the same cache as your real users. When that capacity runs hot, you either scale up or your real users start waiting. Either way, you pay for it.

The unpleasant part is that you rarely see the bot tax broken out. Your hosting bill says "compute time" or "database hours". It doesn't say "rendered the same product page 80,000 times for crawlers in a single afternoon". You see the impact as a slowly growing infrastructure spend, the kind of trend that looks like natural growth.

Auto-scaling makes this worse, not better. A platform that quietly adds capacity to absorb crawler traffic will keep you online and keep growing the bill. The site doesn't go down. The graphs look fine. The invoice arrives.

The bots don't notice. Your users do.

## The robots.txt fix

Well-behaved scrapers, which is to say almost all the ones causing this kind of pressure, do read your `robots.txt`. You can tell them which URL patterns to skip. For the faceted-listing case, a single rule goes a long way:

```text theme={null}
  User-agent: *
  Disallow: /*?*=*
```

That tells crawlers to ignore any URL with query parameters, anywhere on the site. If only some pages have this problem, scope the rule down:

```text theme={null}
  User-agent: *
  Disallow: /products?
```

There's a trade-off here. The same `Disallow` line that stops AI scrapers from drilling into every facet combination also stops them from indexing those URLs at all. That used to matter mostly for Google search results. Now it also matters for what ChatGPT, Claude, and Perplexity recommend when someone asks. Everyone wants to be what the LLMs recommend.

The fix isn't to give up on `robots.txt`; it's to make sure the products stay reachable through paths the rules don't cover. Link to every product from the main listing page. Add a `sitemap.xml` that points search engines and AI crawlers at the canonical product URLs. Bots that should index your catalog follow those. When the same bots see a `Disallow` line, they politely oblige and skip the URLs you asked them to skip. A `robots.txt` is a polite request, not a wall, and that's exactly enough.

Changes to `robots.txt` don't take effect immediately. Most crawlers re-fetch it every day or two. Expect bot traffic to drop within a few days, not within minutes. Sites that were struggling under crawler load tend to return to normal once the rules propagate.

## Caching, with sorted keys

Even after `robots.txt` is in place, some traffic still comes through, and some of it hits pages with query parameters. Two URLs that differ only in parameter order are functionally identical, but a naive cache treats them as different keys.

The fix is to normalize URLs before they reach the cache, wherever the cache lives: CDN, reverse proxy, or application layer. Sort query parameters alphabetically, and strip the ones you don't care about (tracking IDs, session tokens). Hundreds of variants collapse into a single cache entry.

If you happen to be using Varnish, [Varnish 103: Cache Optimization with URL Normalization on Upsun](/posts/hands-on/varnish-103-cache-optimization-with-url-normalization-on-upsun) walks through the VCL-level details.

## Limits in the application itself

The deepest fix isn't a configuration file. It's the application asking itself: should this link exist?

The 1700s calendar problem isn't really a robots.txt problem. It's a UX problem that happens to also be a crawler problem. Nobody wants to see events from the 18th century. The "previous month" link only needs to keep working as long as you have meaningful events to show. Past that point, return a 404, stop rendering the link, or redirect to the earliest month with content.

The same logic applies to faceted product pages. If a facet combination produces zero results, you don't need a unique URL for it. Render a "no products match" page that doesn't link out to other empty combinations.

These are small changes. They remove the infinite hallway that bots will otherwise wander down.

## A new normal, not a villain story

The bots are doing what their owners asked them to do, on inputs their owners didn't anticipate. The websites were built before AI scrapers existed at scale. Their authors assumed the only mechanical traffic they'd see was the occasional Googlebot pass. Both sides are catching up to a new normal.

The good news: the fixes are mostly cheap and mostly under your control. A `robots.txt` that matches the shape of your site. HTML directives like `<meta name="robots" content="noindex">` on the pages you don't want indexed and `rel="nofollow"` on the links you don't want followed. Application limits where infinite navigation doesn't make sense. A cache that knows the difference between meaningful URL variation and noise. All of these work because the same well-behaved bots that overcrawl your site also read your instructions. Apply the ones that fit your application, and most of the bot pressure goes away on its own.

Your hosting bill should follow.
