1:"$Sreact.fragment" 2:I[22016,["/_next/static/chunks/0sqf3kwsxhw92.js","/_next/static/chunks/15vvi4du_kj4d.js","/_next/static/chunks/0t2xr05rlu96l.js","/_next/static/chunks/0j_00-43ohwi..js","/_next/static/chunks/074m5~1.spxnd.js","/_next/static/chunks/03pwh54kk_crp.js"],""] 8:I[6966,["/_next/static/chunks/0sqf3kwsxhw92.js","/_next/static/chunks/15vvi4du_kj4d.js","/_next/static/chunks/0t2xr05rlu96l.js","/_next/static/chunks/0j_00-43ohwi..js","/_next/static/chunks/074m5~1.spxnd.js","/_next/static/chunks/03pwh54kk_crp.js"],"BlogPostContent"] a:I[97367,["/_next/static/chunks/0sqf3kwsxhw92.js","/_next/static/chunks/15vvi4du_kj4d.js","/_next/static/chunks/0t2xr05rlu96l.js","/_next/static/chunks/0j_00-43ohwi..js","/_next/static/chunks/074m5~1.spxnd.js"],"OutletBoundary"] b:"$Sreact.suspense" 0:{"rsc":["$","$1","c",{"children":[["$","div",null,{"className":"min-h-screen bg-background text-foreground","children":[["$","section",null,{"className":"pt-28 pb-16 md:pt-36 md:pb-24 bg-gradient-to-b from-accent/30 to-background","children":["$","div",null,{"className":"container px-4 md:px-6","children":["$","div",null,{"className":"max-w-4xl mx-auto","children":[["$","$L2",null,{"href":"/blog","children":[["$","svg",null,{"xmlns":"http://www.w3.org/2000/svg","width":24,"height":24,"viewBox":"0 0 24 24","fill":"none","stroke":"currentColor","strokeWidth":2,"strokeLinecap":"round","strokeLinejoin":"round","className":"lucide lucide-arrow-left mr-2 h-4 w-4","children":[["$","path","1l729n",{"d":"m12 19-7-7 7-7"}],["$","path","x3x0zl",{"d":"M19 12H5"}],"$undefined"]}],"Back to Blog"],"className":"inline-flex items-center justify-center gap-2 whitespace-nowrap rounded-md text-sm font-medium ring-offset-background transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:pointer-events-none disabled:opacity-50 [&_svg]:pointer-events-none [&_svg]:size-4 [&_svg]:shrink-0 hover:bg-accent hover:text-accent-foreground 
h-10 px-4 py-2 mb-6","ref":null}],["$","div",null,{"className":"inline-flex items-center rounded-full border px-2.5 py-0.5 text-xs font-semibold transition-colors focus:outline-none focus:ring-2 focus:ring-ring focus:ring-offset-2 border-transparent bg-primary text-primary-foreground hover:bg-primary/80 mb-4","children":"Web Scraping"}],["$","h1",null,{"className":"text-3xl md:text-4xl lg:text-5xl font-bold tracking-tighter mb-6 animate-fade-in","children":"Building a Production-Ready Web Scraper: Architecture and Design Patterns"}],["$","div",null,{"className":"flex flex-wrap items-center gap-4 text-muted-foreground mb-8 animate-fade-in","children":[["$","div",null,{"className":"flex items-center gap-2","children":[["$","svg",null,{"xmlns":"http://www.w3.org/2000/svg","width":24,"height":24,"viewBox":"0 0 24 24","fill":"none","stroke":"currentColor","strokeWidth":2,"strokeLinecap":"round","strokeLinejoin":"round","className":"lucide lucide-calendar h-4 w-4","children":[["$","path","1cmpym",{"d":"M8 2v4"}],["$","path","4m81vk",{"d":"M16 2v4"}],["$","rect","1hopcy",{"width":"18","height":"18","x":"3","y":"4","rx":"2"}],["$","path","8toen8",{"d":"M3 10h18"}],"$undefined"]}],["$","span",null,{"children":"November 15, 2024"}]]}],["$","div",null,{"className":"flex items-center gap-2","children":[["$","svg",null,{"xmlns":"http://www.w3.org/2000/svg","width":24,"height":24,"viewBox":"0 0 24 24","fill":"none","stroke":"currentColor","strokeWidth":2,"strokeLinecap":"round","strokeLinejoin":"round","className":"lucide lucide-clock h-4 w-4","children":[["$","circle","1mglay",{"cx":"12","cy":"12","r":"10"}],["$","polyline","68esgv",{"points":"12 6 12 12 16 14"}],"$undefined"]}],["$","span",null,{"children":"15 min read"}]]}],["$","div",null,{"className":"flex items-center gap-2","children":["$","span",null,{"children":["By ","Muhammad Zaid"]}]}]]}],["$","div",null,{"className":"flex flex-wrap gap-2 mb-8 animate-fade-in","children":[["$","div","Web 
Scraping",{"className":"inline-flex items-center rounded-full border px-2.5 py-0.5 text-xs font-semibold transition-colors focus:outline-none focus:ring-2 focus:ring-ring focus:ring-offset-2 text-foreground","children":"Web Scraping"}],["$","div","Architecture",{"className":"inline-flex items-center rounded-full border px-2.5 py-0.5 text-xs font-semibold transition-colors focus:outline-none focus:ring-2 focus:ring-ring focus:ring-offset-2 text-foreground","children":"Architecture"}],["$","div","Python",{"className":"inline-flex items-center rounded-full border px-2.5 py-0.5 text-xs font-semibold transition-colors focus:outline-none focus:ring-2 focus:ring-ring focus:ring-offset-2 text-foreground","children":"Python"}],["$","div","Design Patterns",{"className":"inline-flex items-center rounded-full border px-2.5 py-0.5 text-xs font-semibold transition-colors focus:outline-none focus:ring-2 focus:ring-ring focus:ring-offset-2 text-foreground","children":"Design Patterns"}]]}]]}]}]}],"$L3","$L4","$L5"]}],["$L6"],"$L7"]}],"isPartial":false,"staleTime":300,"varyParams":null,"buildId":"QX83e4YaSJMU9KhrDXtKJ"} 3:["$","section",null,{"className":"pb-12","children":["$","div",null,{"className":"container px-4 md:px-6","children":["$","div",null,{"className":"max-w-4xl mx-auto","children":["$","img",null,{"src":"https://images.unsplash.com/photo-1487058792275-0ad4aaf24ca7?auto=format&fit=crop&w=800","alt":"Building a Production-Ready Web Scraper: Architecture and Design Patterns","className":"w-full h-auto rounded-lg shadow-xl","loading":"lazy"}]}]}]}] 9:T1ff3,# Building a Production-Ready Web Scraper: Architecture and Design Patterns As a **web scraping expert** who's built enterprise-level scrapers handling millions of pages, I'll share the architectural patterns that ensure reliability, scalability, and maintainability. ## The Components of a Production Scraper A production-ready scraper needs: 1. **Scheduler**: Manages scraping tasks 2. **Fetcher**: Downloads pages 3. 
**Parser**: Extracts data 4. **Storage**: Saves results 5. **Monitor**: Tracks performance ## Architecture Overview ```python class ScraperArchitecture: def __init__(self): self.scheduler = Scheduler() self.fetcher = Fetcher() self.parser = Parser() self.storage = Storage() self.monitor = Monitor() ``` ## The Scheduler Component Manages what to scrape and when: ```python from queue import PriorityQueue from dataclasses import dataclass from datetime import datetime @dataclass class Task: url: str priority: int retry_count: int = 0 class Scheduler: def __init__(self): self.queue = PriorityQueue() self.visited = set() def add_task(self, task: Task): if task.url not in self.visited: self.queue.put((task.priority, task)) def get_next_task(self): if not self.queue.empty(): priority, task = self.queue.get() self.visited.add(task.url) return task return None ``` ## The Fetcher Component Handles HTTP requests with retries: ```python import requests from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry class Fetcher: def __init__(self): self.session = self._create_session() def _create_session(self): session = requests.Session() retry = Retry( total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504] ) adapter = HTTPAdapter(max_retries=retry) session.mount('http://', adapter) session.mount('https://', adapter) return session def fetch(self, url, **kwargs): try: response = self.session.get(url, timeout=30, **kwargs) response.raise_for_status() return response except requests.RequestException as e: logger.error(f"Error fetching {url}: {e}") return None ``` ## The Parser Component Extracts and validates data: ```python from bs4 import BeautifulSoup from typing import Dict, Optional class Parser: def parse_product(self, html: str) -> Optional[Dict]: soup = BeautifulSoup(html, 'lxml') try: product = { 'title': self._extract_title(soup), 'price': self._extract_price(soup), 'description': self._extract_description(soup), 'images': 
self._extract_images(soup) } if self._validate_product(product): return product except Exception as e: logger.error(f"Parse error: {e}") return None def _validate_product(self, product: Dict) -> bool: required_fields = ['title', 'price'] return all(product.get(field) for field in required_fields) ``` ## Rate Limiting Respect target servers: ```python import time from collections import deque class RateLimiter: def __init__(self, max_requests: int, time_window: int): self.max_requests = max_requests self.time_window = time_window self.requests = deque() def wait_if_needed(self): now = time.time() # Remove old requests while self.requests and self.requests[0] < now - self.time_window: self.requests.popleft() # Wait if limit reached if len(self.requests) >= self.max_requests: sleep_time = self.time_window - (now - self.requests[0]) if sleep_time > 0: time.sleep(sleep_time) self.requests.append(time.time()) ``` ## Data Storage Efficient storage with deduplication: ```python from sqlalchemy import create_engine, Column, String, Float, DateTime from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import sessionmaker import hashlib Base = declarative_base() class Product(Base): __tablename__ = 'products' id = Column(String, primary_key=True) title = Column(String) price = Column(Float) url = Column(String, unique=True) scraped_at = Column(DateTime) class Storage: def __init__(self, db_url: str): self.engine = create_engine(db_url) Base.metadata.create_all(self.engine) Session = sessionmaker(bind=self.engine) self.session = Session() def save_product(self, data: Dict): # Create unique ID product_id = hashlib.md5( data['url'].encode() ).hexdigest() product = Product( id=product_id, **data, scraped_at=datetime.now() ) self.session.merge(product) self.session.commit() ``` ## Monitoring and Alerts Track scraper health: ```python from dataclasses import dataclass from typing import Dict @dataclass class Metrics: total_requests: int = 0 
successful_requests: int = 0 failed_requests: int = 0 items_scraped: int = 0 class Monitor: def __init__(self): self.metrics = Metrics() def record_request(self, success: bool): self.metrics.total_requests += 1 if success: self.metrics.successful_requests += 1 else: self.metrics.failed_requests += 1 def record_item(self): self.metrics.items_scraped += 1 def get_success_rate(self) -> float: if self.metrics.total_requests == 0: return 0 return self.metrics.successful_requests / self.metrics.total_requests ``` ## Putting It All Together ```python class ProductionScraper: def __init__(self): self.scheduler = Scheduler() self.fetcher = Fetcher() self.parser = Parser() self.storage = Storage('postgresql://...') self.monitor = Monitor() self.rate_limiter = RateLimiter(max_requests=10, time_window=60) def run(self, urls: list): # Add initial tasks for url in urls: self.scheduler.add_task(Task(url=url, priority=1)) # Process tasks while task := self.scheduler.get_next_task(): self.rate_limiter.wait_if_needed() response = self.fetcher.fetch(task.url) self.monitor.record_request(response is not None) if response: product = self.parser.parse_product(response.text) if product: self.storage.save_product(product) self.monitor.record_item() # Report results print(f"Success rate: {self.monitor.get_success_rate():.2%}") print(f"Items scraped: {self.monitor.metrics.items_scraped}") ``` ## Deployment Considerations ### Docker Deployment ```dockerfile FROM python:3.9-slim WORKDIR /app COPY requirements.txt . RUN pip install -r requirements.txt COPY . . CMD ["python", "scraper.py"] ``` ### Kubernetes for Scale ```yaml apiVersion: batch/v1 kind: CronJob metadata: name: web-scraper spec: schedule: "0 */6 * * *" jobTemplate: spec: template: spec: containers: - name: scraper image: scraper:latest ``` ## Conclusion Building production-ready **web scrapers** requires careful architecture and attention to detail. These patterns have helped me build scrapers that run reliably for years. 
Need help with your **web scraping** project? As a **data scraping expert** and **freelance Python developer**, I can help you build scalable, reliable scraping solutions!4:["$","section",null,{"className":"pb-16","children":["$","div",null,{"className":"container px-4 md:px-6","children":["$","div",null,{"className":"max-w-4xl mx-auto","children":["$","article",null,{"className":"prose prose-lg dark:prose-invert max-w-none prose-pre:p-0 prose-pre:bg-transparent prose-pre:border-0","children":["$","$L8",null,{"content":"$9"}]}]}]}]}] 5:["$","section",null,{"className":"section bg-accent/30","children":["$","div",null,{"className":"container px-4 md:px-6","children":["$","div",null,{"className":"max-w-3xl mx-auto text-center space-y-6","children":[["$","h2",null,{"className":"text-3xl md:text-4xl font-bold tracking-tighter","children":["Need Expert ",["$","span",null,{"className":"text-primary","children":"Python Development"}],"?"]}],["$","p",null,{"className":"text-xl text-muted-foreground","children":["Looking to ",["$","strong",null,{"children":"hire Python developer"}]," or need help with ",["$","strong",null,{"children":"Django"}],", ",["$","strong",null,{"children":"web scraping"}],", or ",["$","strong",null,{"children":"automation"}]," projects? 
Let's work together!"]}],["$","div",null,{"className":"flex flex-col sm:flex-row gap-4 justify-center","children":[["$","$L2",null,{"href":"/contact","children":["Get In Touch ",["$","svg",null,{"xmlns":"http://www.w3.org/2000/svg","width":24,"height":24,"viewBox":"0 0 24 24","fill":"none","stroke":"currentColor","strokeWidth":2,"strokeLinecap":"round","strokeLinejoin":"round","className":"lucide lucide-arrow-right ml-2 h-5 w-5","children":[["$","path","1ays0h",{"d":"M5 12h14"}],["$","path","xquz4c",{"d":"m12 5 7 7-7 7"}],"$undefined"]}]],"className":"inline-flex items-center justify-center gap-2 whitespace-nowrap text-sm font-medium ring-offset-background transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:pointer-events-none disabled:opacity-50 [&_svg]:pointer-events-none [&_svg]:size-4 [&_svg]:shrink-0 bg-primary text-primary-foreground hover:bg-primary/90 h-11 px-8 rounded-full","ref":null}],["$","$L2",null,{"href":"/blog","children":"View All Posts","className":"inline-flex items-center justify-center gap-2 whitespace-nowrap text-sm font-medium ring-offset-background transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:pointer-events-none disabled:opacity-50 [&_svg]:pointer-events-none [&_svg]:size-4 [&_svg]:shrink-0 border border-input bg-background hover:bg-accent hover:text-accent-foreground h-11 px-8 rounded-full","ref":null}]]}]]}]}]}] 6:["$","script","script-0",{"src":"/_next/static/chunks/03pwh54kk_crp.js","async":true}] 7:["$","$La",null,{"children":["$","$b",null,{"name":"Next.MetadataOutlet","children":"$@c"}]}] c:null