HEX
Server: nginx/1.29.3
System: Linux 11979.bigscoots-wpo.com 6.8.0-88-generic #89-Ubuntu SMP PREEMPT_DYNAMIC Sat Oct 11 01:02:46 UTC 2025 x86_64
User: nginx (1068)
PHP: 7.4.33
Disabled: exec,system,passthru,shell_exec,proc_open,proc_close,popen,show_source,cmd# Do not modify this line # 1684243876
Upload Files
File: //bigscoots/wpo/security/bs-abuseipdb-sync.py
#!/usr/bin/env python3
"""
bs-abuseipdb-sync.py
Two subcommands:

  sync-abuseipdb   Fetch AbuseIPDB blacklist, update the DB, skip whitelisted IPs.
                   Also dynamically fetches official bot IP lists at runtime.
  generate-conf    Read the DB, skip whitelisted IPs, write nginx geo conf file(s).

Whitelist sources:
  - Static:  whitelist.txt (CIDRs, exact IPs, prefix strings)
  - Dynamic: Official JSON feeds from major bots (fetched at sync time)
"""

import argparse
import ipaddress
import json
import os
import sys
import urllib.request
import urllib.error

# AbuseIPDB blacklist query parameters.
CONF_MIN    = 100      # confidence minimum (0-100); 100 = highest-confidence reports only
IP_VER      = 4        # request IPv4 entries only
LIMIT       = 9999999  # effectively "no limit"; the API decides the real cap server-side
PARTIAL_MIN = 40000    # stale entries are pruned only when the API returned at least this
                       # many rows (guards against mass-removal on a partial/truncated feed)

# Blacklist endpoint with the query parameters baked in at import time.
ABUSEIPDB_URL = (
    f"https://api.abuseipdb.com/api/v2/blacklist"
    f"?confidenceMinimum={CONF_MIN}&ipVersion={IP_VER}&limit={LIMIT}"
)

# ---------------------------------------------------------------------------
# Dynamic bot IP sources
# Each entry: (label, url, ipv4_key)
# ipv4_key is the key inside each prefix dict that holds the IP/CIDR string
# ---------------------------------------------------------------------------
# Feeds consumed by fetch_bot_networks(); each must serve JSON with a
# top-level "prefixes" list of dicts, where ipv4_key names the dict key
# holding the IP/CIDR string (alternate keys are tried as a fallback).
BOT_SOURCES = [
    # OpenAI
    ("GPTBot",           "https://openai.com/gptbot.json",         "ipv4Prefix"),
    ("ChatGPT-User",     "https://openai.com/chatgpt-user.json",   "ipv4Prefix"),
    ("OAI-SearchBot",    "https://openai.com/searchbot.json",      "ipv4Prefix"),
    # Perplexity
    ("PerplexityBot",    "https://www.perplexity.ai/perplexitybot.json",   "ipv4Prefix"),
    ("Perplexity-User",  "https://www.perplexity.ai/perplexity-user.json", "ipv4Prefix"),
    # Amazon
    # NOTE(review): these Amazon URLs look like documentation pages rather than
    # raw JSON endpoints; fetch_json() will yield None (source marked failed)
    # unless they actually serve a JSON body with a "prefixes" list -- verify.
    ("Amazonbot",        "https://developer.amazon.com/amazonbot/ip-addresses/",          "ipv4Prefix"),
    ("AmazonSearchbot",  "https://developer.amazon.com/amazonbot/searchbot-ip-addresses/", "ip_prefix"),
    ("AmazonLiveBot",    "https://developer.amazon.com/amazonbot/live-ip-addresses/",      "ipv4Prefix"),
    # DuckDuckGo
    ("DuckDuckBot",      "https://duckduckgo.com/duckduckbot.json", "ipv4Prefix"),
    # Googlebot
    ("Googlebot",        "https://developers.google.com/search/apis/ipranges/googlebot.json", "ipv4Prefix"),
    # Bing
    ("Bingbot",          "https://www.bing.com/toolbox/bingbot.json", "ipv4Prefix"),
]


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def send_json(success, message, result):
    """Print the standard JSON response envelope to stdout."""
    envelope = {
        "errors": [],
        "messages": [],
        "success": success,
        "result": result,
        "message": message,
    }
    print(json.dumps(envelope, indent=2))


def fail(message):
    """Emit a failure envelope on stdout, then terminate with exit status 1."""
    send_json(False, message, {})
    raise SystemExit(1)  # identical to sys.exit(1)


def fetch_json(url, timeout=30):
    """Fetch *url* and return the parsed JSON body, or None on failure.

    Handled failures: network/HTTP errors and timeouts (the OSError
    family, which includes urllib.error.URLError/HTTPError) and malformed
    URLs or JSON bodies (the ValueError family, which includes
    json.JSONDecodeError). Anything else -- e.g. a programming error --
    now propagates instead of being silently swallowed.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "bs-blocklist-manager/1.0"})
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return json.loads(resp.read().decode())
    except (OSError, ValueError):
        # Best-effort by design: a failed feed is reported as None so the
        # caller can mark the source as not-fetched and carry on.
        return None


def fetch_bot_networks(sources):
    """
    Download every bot IP feed in *sources*.

    Returns (networks, summary): networks is a list of ipaddress network
    objects collected across all feeds; summary maps each source label to
    {"fetched": bool, "count": int}.
    """
    networks = []
    summary = {}

    for label, url, key in sources:
        payload = fetch_json(url)
        if not payload or "prefixes" not in payload:
            summary[label] = {"fetched": False, "count": 0}
            continue

        parsed = 0
        for prefix_entry in payload["prefixes"]:
            # Preferred key first, then the known alternates.
            value = (
                prefix_entry.get(key)
                or prefix_entry.get("ipv4Prefix")
                or prefix_entry.get("ip_prefix")
                or ""
            )
            if not value:
                continue
            if "/" not in value:
                value += "/32"  # bare address -> single-host network
            try:
                networks.append(ipaddress.ip_network(value, strict=False))
            except ValueError:
                continue  # unparseable entry: skip, don't count
            parsed += 1

        summary[label] = {"fetched": True, "count": parsed}

    return networks, summary


def load_whitelist(path):
    """
    Parse a whitelist file into three buckets:
      networks  — ipaddress network objects (lines containing "/", i.e. CIDRs)
      exact     — set of plain IP strings (any valid bare IPv4 or IPv6 address)
      prefixes  — list of raw prefix strings (e.g. "66.249.")

    A missing file yields three empty buckets. Malformed CIDR lines are
    ignored (original leniency preserved).

    Fix: bare-IP detection now uses ipaddress.ip_address instead of a
    dots-and-digits check, so bare IPv6 entries land in *exact* rather
    than the prefix bucket (where startswith matching would over-match,
    e.g. "2001:db8::1" matching "2001:db8::10").
    """
    networks = []
    exact    = set()
    prefixes = []

    try:
        with open(path) as f:
            for line in f:
                entry = line.strip()
                if not entry or entry.startswith("#"):
                    continue
                if "/" in entry:
                    try:
                        networks.append(ipaddress.ip_network(entry, strict=False))
                    except ValueError:
                        pass  # bad CIDR: skip silently
                else:
                    try:
                        ipaddress.ip_address(entry)
                    except ValueError:
                        prefixes.append(entry)  # not an address: treat as prefix
                    else:
                        exact.add(entry)
    except FileNotFoundError:
        pass  # no whitelist file == empty whitelist

    return networks, exact, prefixes


def is_whitelisted(ip_str, networks, exact, prefixes):
    """Return True when *ip_str* matches any whitelist bucket.

    Checked in order: exact string match, string-prefix match, then
    network containment (skipped when ip_str is not a parseable address).
    """
    if ip_str in exact:
        return True
    if any(ip_str.startswith(pfx) for pfx in prefixes):
        return True
    try:
        candidate = ipaddress.ip_address(ip_str)
    except ValueError:
        return False  # unparseable: cannot be inside any network
    return any(candidate in net for net in networks)


def fetch_abuseipdb(api_key):
    """
    Download the AbuseIPDB blacklist (endpoint/params: ABUSEIPDB_URL).

    Returns the parsed JSON response. On HTTP or network errors the
    process terminates via fail() (exit status 1), so callers never see
    a return value from those paths.
    """
    req = urllib.request.Request(
        ABUSEIPDB_URL,
        headers={"Key": api_key, "Accept": "application/json"},
    )
    try:
        # Large payload; generous 120s timeout.
        # NOTE(review): a malformed JSON body raises json.JSONDecodeError
        # here uncaught -- confirm that an unhandled traceback is acceptable.
        with urllib.request.urlopen(req, timeout=120) as resp:
            return json.loads(resp.read().decode())
    except urllib.error.HTTPError as e:
        # HTTPError must be caught before URLError (it is a subclass).
        fail(f"AbuseIPDB HTTP error: {e.code} {e.reason}")
    except urllib.error.URLError as e:
        fail(f"AbuseIPDB request failed: {e.reason}")


def load_db(path):
    """
    Read the pipe-delimited block DB into {ip: (reason, date)}.

    Accepted line shapes: "ip|reason|date" and legacy "ip|reason"
    (date defaults to ""). Blank and pipe-less lines are skipped.
    A missing file yields an empty dict.
    """
    entries = {}
    try:
        fh = open(path)
    except FileNotFoundError:
        return entries
    with fh:
        for raw in fh:
            raw = raw.strip()
            if not raw:
                continue
            # maxsplit=2 keeps any extra "|" characters inside the date field
            fields = raw.split("|", 2)
            if len(fields) == 3:
                ip, reason, date = fields
                entries[ip] = (reason, date)
            elif len(fields) == 2:
                ip, reason = fields
                entries[ip] = (reason, "")
    return entries


def write_db(path, db):
    """
    Persist *db* ({ip: (reason, date)}) as "ip|reason|date" lines.

    Fix: writes to a temporary sibling file and atomically renames it
    into place with os.replace, so a crash or disk-full mid-write can
    never leave a truncated DB for the next sync/generate-conf run.
    """
    tmp_path = path + ".tmp"
    with open(tmp_path, "w") as f:
        for ip, (reason, date) in db.items():
            f.write(f"{ip}|{reason}|{date}\n")
    os.replace(tmp_path, path)  # atomic on POSIX; replaces existing file


# ---------------------------------------------------------------------------
# Subcommands
# ---------------------------------------------------------------------------

def cmd_sync_abuseipdb(args):
    """
    Sync the block DB against the AbuseIPDB blacklist.

    Steps:
      1. Load the static whitelist and fetch the dynamic bot feeds.
      2. Download the AbuseIPDB blacklist (fail() exits on HTTP errors).
      3. Addition pass: add new, non-whitelisted IPs to the DB.
      4. Removal pass: drop AbuseIPDB-tagged entries that are now
         whitelisted or fell off the blacklist (the latter only when the
         API response looks complete -- see PARTIAL_MIN).
      5. Rewrite the DB and print a JSON summary to stdout.

    args must provide: whitelist, api_key, db, now (a caller-supplied
    timestamp string recorded on new entries).
    """
    # --- Load static whitelist ---
    static_nets, exact, prefixes = load_whitelist(args.whitelist)

    # --- Fetch dynamic bot whitelists ---
    # Progress note goes to stderr so stdout stays machine-readable JSON.
    print("Fetching bot IP whitelists...", file=sys.stderr)
    bot_nets, bot_summary = fetch_bot_networks(BOT_SOURCES)

    # Merge: all networks to check against
    all_networks = static_nets + bot_nets

    total_bot_ips = sum(v["count"] for v in bot_summary.values())
    failed_sources = [k for k, v in bot_summary.items() if not v["fetched"]]

    def whitelisted(ip_str):
        # Closure over the merged static + dynamic whitelist data.
        return is_whitelisted(ip_str, all_networks, exact, prefixes)

    # --- Fetch AbuseIPDB ---
    resp = fetch_abuseipdb(args.api_key)
    data = resp.get("data")
    if not isinstance(data, list):
        fail("Failed to retrieve AbuseIPDB data.")

    generated_at = resp.get("meta", {}).get("generatedAt", "")
    count_api    = len(data)
    abuse_set    = {e["ipAddress"]: e["lastReportedAt"] for e in data}

    db = load_db(args.db)

    # Addition pass: new non-whitelisted IPs get a reason string tagged
    # "AbuseIPDB -", which the removal pass below keys on.
    added = kept = skipped_wl = 0
    new_entries = {}
    for ip, last_reported in abuse_set.items():
        if whitelisted(ip):
            skipped_wl += 1
            continue
        if ip in db:
            kept += 1
        else:
            meta = f"AbuseIPDB - Last Reported at {last_reported} - (ASN lookup skipped)"
            new_entries[ip] = (meta, args.now)
            added += 1
    db.update(new_entries)

    # Removal pass: only touches entries this tool created (reason contains
    # "AbuseIPDB -"); manually-added DB entries are never pruned here.
    removed   = 0
    to_remove = []
    for ip, (reason, date) in db.items():
        if "AbuseIPDB -" not in reason:
            continue
        if whitelisted(ip):
            to_remove.append(ip)
            removed += 1
            continue
        # Drop entries that fell off the blacklist, but only when the API
        # returned enough rows to look like a complete (non-partial) dump.
        if count_api >= PARTIAL_MIN and ip not in abuse_set:
            to_remove.append(ip)
            removed += 1
    # Deferred deletion: never mutate db while iterating it.
    for ip in to_remove:
        del db[ip]

    write_db(args.db, db)

    send_json(True, "AbuseIPDB sync complete.", {
        "generatedAt": generated_at,
        "ranAt": args.now,
        "apiCount": count_api,
        "summary": {
            "added": added,
            "kept": kept,
            "skipped_whitelisted": skipped_wl,
            "removed": removed,
        },
        "botWhitelists": {
            "totalIPs": total_bot_ips,
            "failedSources": failed_sources,
            "sources": bot_summary,
        },
    })


def cmd_generate_conf(args):
    """
    Build an nginx geo conf from the DB, silently excluding any
    whitelisted IP. Uses the static whitelist only (no bot fetching at
    conf-gen time). Writes the same conf text to every path in args.out,
    creating parent directories as needed, then prints a JSON summary.
    """
    networks, exact, prefixes = load_whitelist(args.whitelist)
    db = load_db(args.db)

    entries = []
    excluded = 0
    for ip, (reason, date) in db.items():
        # DB keys may be CIDRs; whitelist-check the address part only.
        bare_ip = ip.split("/")[0]
        if is_whitelisted(bare_ip, networks, exact, prefixes):
            excluded += 1
        else:
            entries.append(f"    {ip:<18} \"{reason}\";  # {date}")

    body = "\n".join(entries)
    conf = (
        "geo $remote_addr $block_reason {\n"
        "    default \"\";\n"
        + body + "\n"
        + "}\n"
    )

    for path in args.out:
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(path, "w") as f:
            f.write(conf)

    send_json(True, "Conf generated.", {
        "entries_written": len(entries),
        "whitelisted_skipped": excluded,
    })


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

def main():
    """Parse command-line arguments and dispatch to the chosen subcommand."""
    parser = argparse.ArgumentParser(prog="bs-abuseipdb-sync.py")
    sub = parser.add_subparsers(dest="command")

    sync_parser = sub.add_parser("sync-abuseipdb")
    for flag in ("--api-key", "--db", "--whitelist", "--now"):
        sync_parser.add_argument(flag, required=True)

    gen_parser = sub.add_parser("generate-conf")
    gen_parser.add_argument("--db", required=True)
    gen_parser.add_argument("--whitelist", required=True)
    gen_parser.add_argument("--out", required=True, action="append",
                            help="Output path (repeat for multiple files)")

    args = parser.parse_args()

    # No subcommand given: show usage and exit non-zero.
    if not args.command:
        parser.print_help()
        sys.exit(1)

    # argparse has already rejected unknown subcommands at this point.
    dispatch = {
        "sync-abuseipdb": cmd_sync_abuseipdb,
        "generate-conf": cmd_generate_conf,
    }
    dispatch[args.command](args)


if __name__ == "__main__":
    main()