File: //bigscoots/wpo/security/bs-abuseipdb-sync.py
#!/usr/bin/env python3
"""
bs-abuseipdb-sync.py
Two subcommands:
sync-abuseipdb Fetch AbuseIPDB blacklist, update the DB, skip whitelisted IPs.
Also dynamically fetches official bot IP lists at runtime.
generate-conf Read the DB, skip whitelisted IPs, write nginx geo conf file(s).
Whitelist sources:
- Static: whitelist.txt (CIDRs, exact IPs, prefix strings)
- Dynamic: Official JSON feeds from major bots (fetched at sync time)
"""
import argparse
import ipaddress
import json
import os
import sys
import urllib.request
import urllib.error
# Query parameters sent to the AbuseIPDB blacklist endpoint.
CONF_MIN = 100      # value of the confidenceMinimum parameter
IP_VER = 4          # value of the ipVersion parameter
LIMIT = 9999999     # value of the limit parameter

# The sync only prunes DB entries that are missing from the API response
# when the response holds at least this many entries.
PARTIAL_MIN = 40000

# Fully-qualified blacklist request URL, assembled once at import time.
ABUSEIPDB_URL = (
    "https://api.abuseipdb.com/api/v2/blacklist"
    "?confidenceMinimum={}&ipVersion={}&limit={}"
).format(CONF_MIN, IP_VER, LIMIT)
# ---------------------------------------------------------------------------
# Dynamic bot IP sources
# Each entry is a (label, url, prefix_key) tuple:
#   label       name used for the source in the sync summary
#   url         feed URL fetched at sync time
#   prefix_key  key inside each "prefixes" item that holds the IP/CIDR string
# ---------------------------------------------------------------------------
_IPV4_PREFIX_KEY = "ipv4Prefix"

BOT_SOURCES = [
    # OpenAI
    ("GPTBot", "https://openai.com/gptbot.json", _IPV4_PREFIX_KEY),
    ("ChatGPT-User", "https://openai.com/chatgpt-user.json", _IPV4_PREFIX_KEY),
    ("OAI-SearchBot", "https://openai.com/searchbot.json", _IPV4_PREFIX_KEY),
    # Perplexity
    (
        "PerplexityBot",
        "https://www.perplexity.ai/perplexitybot.json",
        _IPV4_PREFIX_KEY,
    ),
    (
        "Perplexity-User",
        "https://www.perplexity.ai/perplexity-user.json",
        _IPV4_PREFIX_KEY,
    ),
    # Amazon
    (
        "Amazonbot",
        "https://developer.amazon.com/amazonbot/ip-addresses/",
        _IPV4_PREFIX_KEY,
    ),
    (
        "AmazonSearchbot",
        "https://developer.amazon.com/amazonbot/searchbot-ip-addresses/",
        "ip_prefix",
    ),
    (
        "AmazonLiveBot",
        "https://developer.amazon.com/amazonbot/live-ip-addresses/",
        _IPV4_PREFIX_KEY,
    ),
    # DuckDuckGo
    ("DuckDuckBot", "https://duckduckgo.com/duckduckbot.json", _IPV4_PREFIX_KEY),
    # Googlebot
    (
        "Googlebot",
        "https://developers.google.com/search/apis/ipranges/googlebot.json",
        _IPV4_PREFIX_KEY,
    ),
    # Bing
    ("Bingbot", "https://www.bing.com/toolbox/bingbot.json", _IPV4_PREFIX_KEY),
]
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def send_json(success, message, result):
    """Print the script's JSON response envelope to stdout.

    The envelope always carries empty ``errors`` and ``messages`` lists,
    followed by the given ``success`` flag, ``result`` payload and
    ``message`` text, pretty-printed with a two-space indent.
    """
    envelope = {
        "errors": [],
        "messages": [],
        "success": success,
        "result": result,
        "message": message,
    }
    print(json.dumps(envelope, indent=2))
def fail(message):
    """Report a failure and stop the program.

    Prints a JSON envelope with ``success`` set to False, the given message
    and an empty result, then exits with status 1. Never returns.
    """
    send_json(False, message, {})
    raise SystemExit(1)
def fetch_json(url, timeout=30):
    """Fetch a URL and return parsed JSON, or None on failure.

    Any exception raised while building the request, opening the URL,
    reading the body or decoding the JSON is swallowed and reported as None,
    so callers can treat a feed as best-effort.
    """
    headers = {"User-Agent": "bs-blocklist-manager/1.0"}
    try:
        request = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(request, timeout=timeout) as response:
            body = response.read()
        return json.loads(body.decode())
    except Exception:
        return None
def fetch_bot_networks(sources):
    """
    Fetch all bot IP sources dynamically.

    Args:
        sources: iterable of (label, url, key) tuples, where ``key`` names
            the field inside each "prefixes" item that holds the IP/CIDR.

    Returns:
        (networks, summary): a list of ipaddress network objects collected
        from every feed, and a dict mapping each label to
        {"fetched": bool, "count": int}.

    A feed that cannot be fetched, is not a JSON object, or has no
    "prefixes" list is recorded as not fetched. Malformed items inside a
    feed are skipped, so one bad feed cannot abort the whole sync.
    """
    networks = []
    summary = {}
    for label, url, key in sources:
        data = fetch_json(url)
        # Validate the feed shape up front: a non-dict payload or a
        # non-list "prefixes" value would otherwise raise further down.
        prefixes = data.get("prefixes") if isinstance(data, dict) else None
        if not isinstance(prefixes, list):
            summary[label] = {"fetched": False, "count": 0}
            continue
        count = 0
        for entry in prefixes:
            if not isinstance(entry, dict):
                continue
            # Preferred key first, then the known alternate key names.
            raw = entry.get(key) or entry.get("ipv4Prefix") or entry.get("ip_prefix")
            if not isinstance(raw, str) or not raw.strip():
                continue
            try:
                # ip_network() already maps a bare address to a single-host
                # network (/32 for IPv4, /128 for IPv6). Appending "/32" by
                # hand would widen a bare IPv6 address to a huge /32 range.
                networks.append(ipaddress.ip_network(raw.strip(), strict=False))
            except ValueError:
                continue
            count += 1
        summary[label] = {"fetched": True, "count": count}
    return networks, summary
def load_whitelist(path):
    """
    Parse whitelist.txt into three buckets:
      networks — ipaddress network objects (CIDR lines)
      exact    — set of plain IP strings
      prefixes — list of raw prefix strings (e.g. "66.249.")

    Blank lines are ignored, and everything from a "#" to the end of a line
    is treated as a comment, so both whole-line and trailing comments are
    allowed. A line containing "/" that is not a valid network is skipped
    with a warning on stderr. A missing file yields three empty buckets.

    Returns:
        (networks, exact, prefixes)
    """
    networks = []
    exact = set()
    prefixes = []
    try:
        # utf-8-sig tolerates a leading BOM; errors="replace" keeps odd
        # bytes (typically inside comments) from aborting the whole load.
        with open(path, encoding="utf-8-sig", errors="replace") as f:
            for lineno, line in enumerate(f, 1):
                # Strip comments before classifying: without this,
                # "10.0.0.0/8 # lan" is an invalid CIDR and
                # "1.2.3.4 # host" becomes a prefix that matches nothing.
                entry = line.split("#", 1)[0].strip()
                if not entry:
                    continue
                if "/" in entry:
                    try:
                        networks.append(ipaddress.ip_network(entry, strict=False))
                    except ValueError:
                        # Still best-effort, but no longer silent: a dropped
                        # whitelist entry can get a trusted range blocked.
                        print(
                            f"whitelist {path}:{lineno}: "
                            f"ignoring invalid CIDR {entry!r}",
                            file=sys.stderr,
                        )
                elif entry.count(".") == 3 and all(
                    part.isdigit() for part in entry.split(".")
                ):
                    exact.add(entry)
                else:
                    prefixes.append(entry)
    except FileNotFoundError:
        pass
    return networks, exact, prefixes
def is_whitelisted(ip_str, networks, exact, prefixes):
    """Return True when ``ip_str`` matches any whitelist bucket.

    Checks are made in order: exact string membership, raw string prefix
    match, then containment in one of the network objects. A string that
    does not parse as an IP address can only match the first two buckets.
    """
    if ip_str in exact or any(ip_str.startswith(pfx) for pfx in prefixes):
        return True
    try:
        address = ipaddress.ip_address(ip_str)
    except ValueError:
        return False
    return any(address in network for network in networks)
def fetch_abuseipdb(api_key):
    """Download the AbuseIPDB blacklist and return the decoded JSON object.

    Args:
        api_key: AbuseIPDB API key, sent in the ``Key`` request header.

    Returns:
        The parsed response as a dict.

    Every failure path ends in ``fail()``, which prints the JSON failure
    envelope and exits with status 1. This covers HTTP errors, connection
    errors, socket-level errors while reading the body, undecodable or
    invalid JSON, and a payload that is not a JSON object.
    """
    req = urllib.request.Request(
        ABUSEIPDB_URL,
        headers={"Key": api_key, "Accept": "application/json"},
    )
    try:
        with urllib.request.urlopen(req, timeout=120) as resp:
            payload = json.loads(resp.read().decode())
    except urllib.error.HTTPError as e:
        fail(f"AbuseIPDB HTTP error: {e.code} {e.reason}")
    except urllib.error.URLError as e:
        fail(f"AbuseIPDB request failed: {e.reason}")
    except (OSError, ValueError) as e:
        # A timeout or reset during resp.read() surfaces as a plain OSError,
        # and bad UTF-8 or bad JSON as a ValueError subclass. Report them
        # through the same JSON envelope instead of a traceback.
        fail(f"AbuseIPDB response could not be read: {e}")
    if not isinstance(payload, dict):
        # The caller reads the result with .get(), so reject other JSON types.
        fail("AbuseIPDB returned an unexpected response.")
    return payload
def load_db(path):
    """Read the pipe-delimited DB file into a dict.

    Each non-blank line is split on "|" at most twice into
    ``ip|reason|date``. The result maps ip -> (reason, date); a line with
    only two fields gets an empty date, and a line with no "|" at all is
    skipped. A missing file yields an empty dict.
    """
    entries = {}
    try:
        with open(path) as handle:
            for record in map(str.strip, handle):
                if not record:
                    continue
                fields = record.split("|", 2)
                if len(fields) < 2:
                    continue
                ip, reason, *rest = fields
                entries[ip] = (reason, rest[0] if rest else "")
    except FileNotFoundError:
        pass
    return entries
def write_db(path, db):
    """Overwrite the DB file with one ``ip|reason|date`` line per entry.

    ``db`` maps ip -> (reason, date); entries are written in the dict's
    iteration order.
    """
    with open(path, "w") as handle:
        handle.writelines(
            f"{ip}|{reason}|{date}\n" for ip, (reason, date) in db.items()
        )
# ---------------------------------------------------------------------------
# Subcommands
# ---------------------------------------------------------------------------
def cmd_sync_abuseipdb(args):
    """Synchronise the DB file with the current AbuseIPDB blacklist.

    Reads ``args.whitelist``, ``args.api_key``, ``args.db`` and ``args.now``.

    Steps, in order:
      1. Load the static whitelist and fetch the dynamic bot networks.
      2. Fetch the AbuseIPDB blacklist, failing if "data" is not a list.
      3. Add every reported, non-whitelisted IP that is not in the DB yet.
      4. Remove AbuseIPDB-managed DB entries that are now whitelisted. Also
         remove those no longer reported, but only when the API returned at
         least PARTIAL_MIN entries.
      5. Write the DB back and print a JSON summary.
    """
    # --- Static whitelist ---
    wl_networks, wl_exact, wl_prefixes = load_whitelist(args.whitelist)

    # --- Dynamic bot whitelists ---
    print("Fetching bot IP whitelists...", file=sys.stderr)
    bot_networks, bot_summary = fetch_bot_networks(BOT_SOURCES)
    combined_networks = wl_networks + bot_networks
    total_bot_ips = sum(info["count"] for info in bot_summary.values())
    failed_sources = [
        label for label, info in bot_summary.items() if not info["fetched"]
    ]

    def whitelisted(ip_str):
        return is_whitelisted(ip_str, combined_networks, wl_exact, wl_prefixes)

    # --- AbuseIPDB blacklist ---
    payload = fetch_abuseipdb(args.api_key)
    records = payload.get("data")
    if not isinstance(records, list):
        fail("Failed to retrieve AbuseIPDB data.")
    generated_at = payload.get("meta", {}).get("generatedAt", "")
    count_api = len(records)
    reported = {rec["ipAddress"]: rec["lastReportedAt"] for rec in records}

    db = load_db(args.db)

    # --- Addition pass ---
    added = kept = skipped_wl = 0
    pending = {}
    for ip, last_reported in reported.items():
        if whitelisted(ip):
            skipped_wl += 1
        elif ip in db:
            kept += 1
        else:
            pending[ip] = (
                f"AbuseIPDB - Last Reported at {last_reported} - (ASN lookup skipped)",
                args.now,
            )
            added += 1
    db.update(pending)

    # --- Removal pass (only entries this script manages) ---
    # Entries that merely dropped off the list are pruned only when the API
    # response reached the PARTIAL_MIN threshold.
    response_is_complete = count_api >= PARTIAL_MIN
    stale = [
        ip
        for ip, (reason, _date) in db.items()
        if "AbuseIPDB -" in reason
        and (whitelisted(ip) or (response_is_complete and ip not in reported))
    ]
    for ip in stale:
        del db[ip]
    removed = len(stale)

    write_db(args.db, db)

    report = {
        "generatedAt": generated_at,
        "ranAt": args.now,
        "apiCount": count_api,
        "summary": {
            "added": added,
            "kept": kept,
            "skipped_whitelisted": skipped_wl,
            "removed": removed,
        },
        "botWhitelists": {
            "totalIPs": total_bot_ips,
            "failedSources": failed_sources,
            "sources": bot_summary,
        },
    }
    send_json(True, "AbuseIPDB sync complete.", report)
def _nginx_quote(value):
    """Escape backslashes and double quotes for an nginx double-quoted string."""
    return value.replace("\\", "\\\\").replace('"', '\\"')


def cmd_generate_conf(args):
    """
    Build nginx geo conf from DB, silently excluding any whitelisted IP.
    Uses static whitelist only (no bot fetching at conf-gen time).

    Reads ``args.whitelist`` and ``args.db``, then writes the same conf text
    to every path in ``args.out``, creating parent directories as needed.
    Finishes by printing a JSON summary with the number of entries written
    and the number skipped as whitelisted.
    """
    networks, exact, prefixes = load_whitelist(args.whitelist)
    db = load_db(args.db)
    lines = []
    skipped = 0
    for ip, (reason, date) in db.items():
        # DB keys may be CIDRs; the whitelist check uses the address part.
        check_ip = ip.split("/")[0]
        if is_whitelisted(check_ip, networks, exact, prefixes):
            skipped += 1
            continue
        # The reason is free text from the DB. Escape it so a stray quote or
        # backslash cannot end the string early and corrupt the conf.
        lines.append(f" {ip:<18} \"{_nginx_quote(reason)}\"; # {date}")
    conf = (
        "geo $remote_addr $block_reason {\n"
        " default \"\";\n"
        + "\n".join(lines) + "\n"
        + "}\n"
    )
    for path in args.out:
        d = os.path.dirname(path)
        if d:
            os.makedirs(d, exist_ok=True)
        with open(path, "w") as f:
            f.write(conf)
    send_json(True, "Conf generated.", {
        "entries_written": len(lines),
        "whitelisted_skipped": skipped,
    })
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
def main():
    """Parse the command line and run the selected subcommand.

    Subcommands:
      sync-abuseipdb  requires --api-key, --db, --whitelist and --now
      generate-conf   requires --db, --whitelist and one or more --out

    With no subcommand, prints the help text and exits with status 1.
    """
    parser = argparse.ArgumentParser(prog="bs-abuseipdb-sync.py")
    subcommands = parser.add_subparsers(dest="command")

    sync_parser = subcommands.add_parser("sync-abuseipdb")
    for flag in ("--api-key", "--db", "--whitelist", "--now"):
        sync_parser.add_argument(flag, required=True)

    conf_parser = subcommands.add_parser("generate-conf")
    for flag in ("--db", "--whitelist"):
        conf_parser.add_argument(flag, required=True)
    conf_parser.add_argument(
        "--out",
        required=True,
        action="append",
        help="Output path (repeat for multiple files)",
    )

    args = parser.parse_args()
    command = args.command
    if not command:
        parser.print_help()
        sys.exit(1)

    if command == "generate-conf":
        cmd_generate_conf(args)
    elif command == "sync-abuseipdb":
        cmd_sync_abuseipdb(args)


if __name__ == "__main__":
    main()