feat: python worker (bot, scraper, notifier, scheduler)

This commit is contained in:
opencode
2026-06-16 19:12:35 +02:00
committed by Jose Lago
parent d32ce26d9e
commit fbbdc5e54b
7 changed files with 729 additions and 0 deletions
+96
View File
@@ -0,0 +1,96 @@
import asyncio
import logging
import math
from datetime import datetime, timezone
from typing import Any
from urllib.parse import quote_plus

import httpx
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Search endpoint queried by fetch_ads().
_API_URL = "/".join(
    (
        "https://www.willhaben.at/webapi/ad-search/search/atz/seo",
        "kaufen-und-verkaufen/marktplatz",
    )
)

# Headers sent with every search request: JSON is requested and a
# browser-like user agent is presented.
# NOTE(review): "x-wh-client" presumably identifies the calling client to the
# API -- confirm against the service before changing it.
_HEADERS = {
    "accept": "application/json",
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
    "x-wh-client": "api@willhaben.at;responsive_web;server;1.0.0;desktop",
}
async def fetch_ads(
    keyword: str,
    *,
    rows: int = 30,
    max_attempts: int = 3,
) -> tuple[list[dict[str, Any]], int]:
    """Query the search endpoint for *keyword* and return the raw adverts.

    The request is retried with exponential backoff (``2 ** attempt``
    seconds) when it fails with a transport error, a 5xx/408/429 response or
    an undecodable body. Other 4xx responses are treated as permanent and
    re-raised immediately, since repeating the same request cannot succeed.

    Args:
        keyword: Search term sent as the ``keyword`` query parameter.
        rows: Value of the ``rows`` query parameter (page size requested).
        max_attempts: Total number of attempts before giving up; must be >= 1.

    Returns:
        ``(ads, total_hits)`` where ``ads`` is the list found under
        ``advertSummaryList.advertSummary`` (empty when absent or null) and
        ``total_hits`` is ``rowsFound`` as an int (0 when absent or null).

    Raises:
        ValueError: If ``max_attempts`` is < 1, or the final attempt returned
            a body that is not valid JSON.
        httpx.HTTPError: If the final attempt failed, or a non-retryable 4xx
            status was returned.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    # NOTE(review): "sort": 1 is passed through unchanged; its meaning is
    # defined by the remote API -- confirm before altering.
    params = {
        "keyword": keyword,
        "rows": rows,
        "sort": 1,
    }

    async with httpx.AsyncClient(timeout=30.0) as client:
        for attempt in range(1, max_attempts + 1):
            try:
                resp = await client.get(_API_URL, headers=_HEADERS, params=params)
                resp.raise_for_status()
                data = resp.json()
                break
            except (httpx.HTTPError, ValueError) as exc:
                # ValueError covers JSON decoding failures from resp.json().
                logger.warning("fetch_ads attempt %d failed: %s", attempt, exc)
                permanent = (
                    isinstance(exc, httpx.HTTPStatusError)
                    and 400 <= exc.response.status_code < 500
                    and exc.response.status_code not in (408, 429)
                )
                if permanent or attempt >= max_attempts:
                    raise
                await asyncio.sleep(2 ** attempt)

    # The API may send explicit nulls; ``or`` guards against those as well as
    # against missing keys.
    summary_list = data.get("advertSummaryList") or {}
    ads_raw = summary_list.get("advertSummary") or []
    total_hits = int(data.get("rowsFound") or 0)
    return ads_raw, total_hits
def _parse_attributes(ad_dict: dict[str, Any]) -> dict[str, str]:
    """Flatten an advert's attribute list into a ``name -> first value`` map.

    Reads ``ad_dict["attributes"]["attribute"]``, expected to be a list of
    dicts carrying a ``name`` and a ``values`` list. Entries without a name
    or without values are skipped. A missing, null or malformed container
    yields an empty dict instead of raising.
    """
    container = ad_dict.get("attributes")
    entries = container.get("attribute") if isinstance(container, dict) else None
    if not isinstance(entries, list):
        return {}

    result: dict[str, str] = {}
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        name = entry.get("name")
        values = entry.get("values")
        # Require a real list so a bare string is not indexed per character.
        if isinstance(name, str) and name and isinstance(values, list) and values:
            result[name] = values[0]
    return result


def _parse_price(raw: Any) -> "float | None":
    """Convert a raw price value to a finite float, or None if unusable."""
    if raw is None:
        return None
    try:
        # Commas are dropped before conversion, e.g. "1,250.50" -> 1250.5.
        value = float(str(raw).replace(",", ""))
    except ValueError:
        return None
    # float() accepts "nan" and "inf"; neither is a meaningful price.
    return value if math.isfinite(value) else None


def _parse_timestamp(raw: Any) -> "str | None":
    """Normalise an ISO 8601 string via ``datetime``; None if unparseable."""
    if not isinstance(raw, str) or not raw:
        return None
    # Older ``fromisoformat`` versions reject a trailing "Z", so spell the
    # UTC offset out explicitly.
    text = raw[:-1] + "+00:00" if raw.endswith("Z") else raw
    try:
        return datetime.fromisoformat(text).isoformat()
    except ValueError:
        return None


def extract_ad_fields(ad_dict: dict[str, Any]) -> dict[str, Any]:
    """Extract the fields of interest from one raw advert dict.

    Returns:
        A dict with the keys:

        * ``wh_ad_id``: ``ad_dict["id"]`` as a string ("" when missing/null).
        * ``title``: the ``HEADING`` attribute, else ``description``, else "".
        * ``price``: the ``PRICE/AMOUNT`` attribute as a finite float, or None.
        * ``location``: the ``LOCATION`` attribute, or None.
        * ``url``: advert URL built from the ``SEO_URL`` attribute, or None.
        * ``published_at``: ISO 8601 string taken from ``PUBLISHED_String``,
          falling back to ``CHANGED_String``; None if neither parses.
    """
    attrs = _parse_attributes(ad_dict)

    title = attrs.get("HEADING") or ad_dict.get("description") or ""

    seo_url = attrs.get("SEO_URL")
    # Strip leading slashes so the joined URL never contains "iad//...".
    seo_path = str(seo_url).lstrip("/") if seo_url else ""
    url = f"https://www.willhaben.at/iad/{seo_path}" if seo_path else None

    # Prefer the publication time; use the change time only when the former
    # is absent or cannot be parsed.
    published_at = None
    for key in ("PUBLISHED_String", "CHANGED_String"):
        published_at = _parse_timestamp(attrs.get(key))
        if published_at is not None:
            break

    raw_id = ad_dict.get("id")
    return {
        # A null id must not turn into the literal string "None".
        "wh_ad_id": "" if raw_id is None else str(raw_id),
        "title": title,
        "price": _parse_price(attrs.get("PRICE/AMOUNT")),
        "location": attrs.get("LOCATION"),
        "url": url,
        "published_at": published_at,
    }