# As a condition of accessing this website, you agree to abide by the following
# content signals:
# (a) If a content-signal = yes, you may collect content for the corresponding
# use.
# (b) If a content-signal = no, you may not collect content for the
# corresponding use.
# (c) If the website operator does not include a content signal for a
# corresponding use, the website operator neither grants nor restricts
# permission via content signal with respect to the corresponding use.
# The content signals and their meanings are:
# search: building a search index and providing search results (e.g., returning
# hyperlinks and short excerpts from your website's contents). Search does not
# include providing AI-generated search summaries.
# ai-input: inputting content into one or more AI models (e.g., retrieval
# augmented generation, grounding, or other real-time taking of content for
# generative AI search answers).
# ai-train: training or fine-tuning AI models.
# ANY RESTRICTIONS EXPRESSED VIA CONTENT SIGNALS ARE EXPRESS RESERVATIONS OF
# RIGHTS UNDER ARTICLE 4 OF THE EUROPEAN UNION DIRECTIVE 2019/790 ON COPYRIGHT
# AND RELATED RIGHTS IN THE DIGITAL SINGLE MARKET.
# BEGIN Cloudflare Managed Content
User-agent: *
Content-Signal: search=yes, ai-train=no
Allow: /

User-agent: Amazonbot
Disallow: /

User-agent: Applebot-Extended
Disallow: /

User-agent: Bytespider
Disallow: /

User-agent: CCBot
Disallow: /

User-agent: ClaudeBot
Disallow: /

User-agent: Google-Extended
Disallow: /

User-agent: GPTBot
Disallow: /

User-agent: meta-externalagent
Disallow: /
# END Cloudflare Managed Content

# ---------------------------
# Combined robots.txt for zelmu.com
# Goals: SEO-friendly + reduce server load + protect admin/staging
# Includes major search engines + popular AI / research bots (allowed)
# ---------------------------

# ===== Global default rules (apply to all crawlers unless overridden) =====
User-agent: *
Crawl-delay: 2
Allow: /

# Public pages (explicit for clarity)
Allow: /page/
Allow: /pages
Allow: /user/
Allow: /profile

# Protect admin, auth, checkout, API and sensitive endpoints
# (the /admin/ rule already covers the specific /admin/* paths; they are
# kept for clarity)
Disallow: /admin/
Disallow: /admin/users
Disallow: /admin/pages
Disallow: /admin/audit-logs
Disallow: /login/
Disallow: /register/
Disallow: /account/
Disallow: /checkout/
Disallow: /cart/
Disallow: /api/
Disallow: /internal/
Disallow: /wp-admin/
Disallow: /cgi-bin/

# Avoid crawling infinite/duplicate URLs (common tracking params)
# Note: the sort= and page= rules also block legitimate sorted/paginated
# listings; remove them if those pages should be crawled.
Disallow: /*?*utm_
Disallow: /*?*fbclid
Disallow: /*?*gclid
Disallow: /*?*session=
Disallow: /*?*ref=
Disallow: /*?*sort=
Disallow: /*?*page=

# Reduce crawl of very heavy or low-value folders
Disallow: /assets/large-images/
Disallow: /static/videos/
Disallow: /backup/
Disallow: /cache/
Disallow: /tmp/
Disallow: /download/
Disallow: /exports/

# Optional: block tag/category archives if they cause duplicate content
# Uncomment if your site produces thin archive pages
# Disallow: /tag/
# Disallow: /category/

# ===== Specific crawler directives (overrides) =====
# Note: a crawler obeys only its most specific matching group (RFC 9309),
# not the * group, so each override below must repeat any global Disallow
# rules it should still follow.

# Google family
User-agent: Googlebot
Allow: /
# Google ignores Crawl-delay; keep server-side rate limiting if needed
Disallow: /admin/
Disallow: /checkout/

User-agent: Googlebot-Image
Allow: /images/
Disallow: /assets/large-images/

User-agent: Google-adstxt
Allow: /

# Bing / Microsoft
User-agent: Bingbot
Crawl-delay: 2
Allow: /
Disallow: /admin/

User-agent: BingPreview
Allow: /

# Yahoo / Slurp
User-agent: Slurp
Allow: /

# DuckDuckGo
User-agent: DuckDuckBot
Allow: /

# Yandex
User-agent: Yandex
Crawl-delay: 5
Allow: /
Disallow: /admin/

# Baidu
User-agent: Baiduspider
Crawl-delay: 10
Allow: /

# Sogou / Exalead (Exabot)
User-agent: Sogou Spider
Allow: /

User-agent: Exabot
Allow: /

# Social / Preview bots
User-agent: Facebot
Allow: /

User-agent: LinkedInBot
Allow: /

# SEO / crawler tools (rate-limit or disallow if you don't want them crawling)
User-agent: AhrefsBot
Crawl-delay: 10
# If you prefer to block auditing bots, uncomment:
# Disallow: /

User-agent: SemrushBot
Crawl-delay: 10

User-agent: MJ12bot
Crawl-delay: 10

User-agent: DotBot
Crawl-delay: 10

# Research / AI / answer-engine bots: allowed (per "AI everything")
# These are common user-agent names; some services may use different UA strings.
# Note: GPTBot, ClaudeBot, CCBot and others are blocked by the
# Cloudflare-managed groups above, which take precedence for those exact
# user agents.
User-agent: OpenAI
User-agent: OpenAIbot
User-agent: Anthropic
User-agent: AnthropicBot
User-agent: Perplexity
User-agent: PerplexityBot
User-agent: PerplexityAI
User-agent: Poe
User-agent: HuggingFace
User-agent: HuggingFaceBot
User-agent: LLM-Research
User-agent: AIbot
Allow: /

# Generic crawler tools / others
User-agent: ia_archiver
Allow: /

# ===== Staging / dev note (use separate robots.txt on staging domain) =====
# If you host a staging site (staging.zelmu.com), prefer a strict robots.txt there:
# User-agent: *
# Disallow: /
# And protect with HTTP auth / IP allowlist; robots.txt alone isn't secure.

# ===== Sitemap =====
Sitemap: https://zelmu.com/sitemap.xml