Traditional web monitoring tools check if a page is "up" or "down." But modern businesses need to detect meaningful changes: a competitor adjusting prices, a regulatory update, a product going out of stock, or a fraudulent listing appearing. By combining proxies for reliable data collection with LLMs for intelligent analysis, you can build monitoring pipelines that understand context and alert you only when it matters.
Architecture Overview
An AI-powered monitoring pipeline has four stages:
- Collection: Fetch pages through rotating proxies to avoid blocks
- Diffing: Compare current content against the last snapshot
- Analysis: Use an LLM to classify whether the change is meaningful
- Alerting: Notify via Slack, email, or webhook when action is needed
Step 1: Reliable Collection with Proxies
import hashlib
import json
from datetime import datetime, timezone

import requests
from bs4 import BeautifulSoup
def fetch_page(url, country="us"):
    """Fetch a page through ZentisLabs proxy with geo-targeting.

    Args:
        url: Address of the page to download.
        country: Country code interpolated into the proxy username
            (``PASS_country-<country>``); per the proxy's naming this is the
            geo-targeting selector.

    Returns:
        A snapshot dict with the keys ``url``, ``text`` (visible text joined
        with single spaces), ``html`` (markup after dynamic elements were
        removed), ``hash`` (SHA-256 hex digest of ``text``) and
        ``fetched_at`` (timezone-aware UTC timestamp in ISO-8601 form).

    Raises:
        requests.HTTPError: If the response status is 4xx/5xx.
        requests.RequestException: On connection failure or when the
            15-second timeout expires.
    """
    proxy = f"http://USER:PASS_country-{country}@gate.zentislabs.com:7777"
    proxies = {"http": proxy, "https": proxy}
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/122.0.0.0"}

    r = requests.get(url, proxies=proxies, headers=headers, timeout=15)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, "html.parser")
    # Remove dynamic elements (ads, timestamps)
    for tag in soup.select("script, style, iframe, .ad-banner, .timestamp"):
        tag.decompose()

    # Extract the normalised text once and hash exactly that string, so the
    # digest always describes the stored "text" and does not flip on
    # whitespace-only markup changes.
    text = soup.get_text(separator=" ", strip=True)
    return {
        "url": url,
        "text": text,
        "html": str(soup),
        "hash": hashlib.sha256(text.encode()).hexdigest(),
        # Timezone-aware replacement for the deprecated datetime.utcnow().
        "fetched_at": datetime.now(timezone.utc).isoformat(),
    }


# Step 2: Smart Diffing
import difflib
def compute_diff(old_text, new_text, min_change_pct=0.5):
    """Compute a human-readable diff between two page snapshots.

    Args:
        old_text: Text of the previous snapshot.
        new_text: Text of the current snapshot.
        min_change_pct: Percentage of change that must be exceeded for the
            result to be flagged in ``has_changes``. Defaults to 0.5.

    Returns:
        A dict with ``diff`` (unified diff of the two texts, line by line,
        joined with newlines), ``change_pct`` (character-level dissimilarity
        as a percentage rounded to one decimal) and ``has_changes`` (True
        when ``change_pct`` is greater than ``min_change_pct``).
    """
    old_lines = old_text.splitlines()
    new_lines = new_text.splitlines()
    changes = "\n".join(difflib.unified_diff(old_lines, new_lines, lineterm=""))

    # Calculate change percentage.
    # autojunk must be off: the matcher compares single characters, and with
    # the default heuristic any character occurring in more than ~1% of a
    # text of 200+ characters is ignored as "popular", which can report a
    # tiny edit as a near-total rewrite. The exact comparison is slower on
    # very large texts.
    matcher = difflib.SequenceMatcher(None, old_text, new_text, autojunk=False)
    change_pct = round((1 - matcher.ratio()) * 100, 1)

    return {
        "diff": changes,
        "change_pct": change_pct,
        "has_changes": change_pct > min_change_pct,
    }


# Step 3: LLM Analysis
import openai
def analyze_change(url, diff_text, change_pct):
    """Ask an LLM to judge whether a detected page change is meaningful.

    Args:
        url: Address of the monitored page, included in the prompt.
        diff_text: Diff of the page; only its first 3000 characters are sent.
        change_pct: Change percentage, included in the prompt.

    Returns:
        The model's reply parsed from JSON. The prompt requests the keys
        ``category``, ``summary`` and ``action_needed``, but the reply is
        returned as parsed without validation.
    """
    client = openai.OpenAI()

    system_prompt = (
        "You analyze website changes. Classify each change as: "
        "CRITICAL (pricing, legal, product changes), "
        "NOTABLE (new content, layout changes), or "
        "NOISE (timestamp, ad rotation, minor formatting). "
        "Respond with JSON: {category, summary, action_needed}."
    )
    user_prompt = f"URL: {url}\nChange: {change_pct}%\n\nDiff:\n{diff_text[:3000]}"

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        # Request a JSON object so the reply can be fed to json.loads.
        response_format={"type": "json_object"},
    )
    reply = completion.choices[0].message.content
    return json.loads(reply)


# Step 4: Smart Alerting
import requests as req
def send_alert(analysis, url, webhook_url="https://hooks.slack.com/services/YOUR/WEBHOOK/URL"):
    """Send alert to Slack if change is meaningful.

    Args:
        analysis: Classification dict; ``category`` and ``summary`` are
            required for non-noise alerts, ``action_needed`` is optional and
            falls back to "Review".
        url: Address of the page the alert refers to.
        webhook_url: Slack incoming-webhook endpoint to post to. Defaults to
            the placeholder URL, which must be replaced for real use.

    Returns:
        None. Nothing is sent when the category is "NOISE".

    Raises:
        KeyError: If ``category`` (or, for non-noise alerts, ``summary``) is
            missing from ``analysis``.
        requests.RequestException: If the webhook cannot be reached or does
            not respond within the timeout.
    """
    category = analysis["category"]
    if category == "NOISE":
        return  # Skip noise

    # Red for critical changes, amber for everything else.
    color = "#ff0000" if category == "CRITICAL" else "#ffaa00"
    payload = {
        "attachments": [{
            "color": color,
            "title": f"{category}: {url}",
            "text": analysis["summary"],
            "fields": [
                {"title": "Action Needed", "value": analysis.get("action_needed", "Review"), "short": True},
            ],
        }]
    }
    # Without a timeout an unresponsive webhook would block the whole
    # monitoring run indefinitely.
    req.post(webhook_url, json=payload, timeout=10)


# Putting It Together
# monitor.py - Run on a schedule (cron, Airflow, n8n)
import json
from pathlib import Path
def monitor_url(url, country="us"):
    """Run one monitoring pass for a single URL.

    Fetches the page, compares it with the stored snapshot (if one exists),
    runs LLM analysis and alerting when the diff is flagged as a change, and
    then overwrites the snapshot with the freshly fetched page.

    Args:
        url: Address of the page to monitor.
        country: Country code passed through to ``fetch_page``.
    """
    # One snapshot file per URL, named after the MD5 digest of the URL.
    url_digest = hashlib.md5(url.encode()).hexdigest()
    snapshot_file = Path(f"snapshots/{url_digest}.json")

    # Fetch current page
    current = fetch_page(url, country)

    # Compare with the previous snapshot, if there is one
    if snapshot_file.exists():
        stored = snapshot_file.read_text()
        previous = json.loads(stored)
        diff = compute_diff(previous["text"], current["text"])
        if diff["has_changes"]:
            analysis = analyze_change(url, diff["diff"], diff["change_pct"])
            send_alert(analysis, url)
            print(f"[{analysis['category']}] {url}: {analysis['summary']}")

    # Save current snapshot (runs on every pass, changed or not)
    snapshot_dir = snapshot_file.parent
    snapshot_dir.mkdir(exist_ok=True)
    serialized = json.dumps(current)
    snapshot_file.write_text(serialized)
# Monitor competitor pricing pagesurls = [ "https://competitor1.com/pricing", "https://competitor2.com/plans", "https://competitor3.com/api/pricing",]for url in urls: monitor_url(url)🤖 ZentisLabs residential proxies ensure your monitoring never gets blocked. Rotate IPs automatically and geo-target to see the same content your customers see in any market.
