feat: python worker (bot, scraper, notifier, scheduler)
This commit is contained in:
@@ -0,0 +1,96 @@
|
||||
import asyncio
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
from urllib.parse import quote_plus
|
||||
|
||||
import httpx
|
||||
|
||||
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Endpoint requested by fetch_ads(); the two adjacent string literals are
# concatenated into a single URL.
_API_URL = (
    "https://www.willhaben.at/webapi/ad-search/search/atz/seo/"
    "kaufen-und-verkaufen/marktplatz"
)

# Headers sent with every request made by fetch_ads().
# NOTE(review): the browser-style user-agent and the "x-wh-client" value
# presumably mimic the site's own web client -- confirm whether the API
# actually requires them.
_HEADERS = {
    "accept": "application/json",
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
    "x-wh-client": "api@willhaben.at;responsive_web;server;1.0.0;desktop",
}
|
||||
|
||||
|
||||
async def fetch_ads(keyword: str) -> tuple[list[dict[str, Any]], int]:
    """Query the search endpoint at ``_API_URL`` for *keyword*.

    Sends a GET request asking for 30 rows with ``sort=1``. The request is
    attempted up to three times in total, sleeping 2 s and then 4 s between
    attempts.

    Args:
        keyword: Search term sent as the ``keyword`` query parameter.

    Returns:
        A ``(ads, total_hits)`` tuple: the raw entries found under
        ``advertSummaryList.advertSummary`` and the ``rowsFound`` value
        converted to ``int``. Missing or ``null`` fields yield ``[]`` and
        ``0`` respectively.

    Raises:
        httpx.HTTPError: The final attempt failed at the transport level or
            returned an error status.
        ValueError: The final attempt returned a body that is not valid JSON.
    """
    max_attempts = 3
    params = {"keyword": keyword, "rows": 30, "sort": 1}

    async with httpx.AsyncClient(timeout=30.0) as client:
        for attempt in range(1, max_attempts + 1):
            try:
                resp = await client.get(_API_URL, headers=_HEADERS, params=params)
                resp.raise_for_status()
                data = resp.json()
                break
            except (httpx.HTTPError, ValueError) as exc:
                # Only request/status/JSON-decoding failures are retried;
                # anything else is a programming error and propagates at once.
                logger.warning("fetch_ads attempt %d failed: %s", attempt, exc)
                if attempt == max_attempts:
                    raise
                # Exponential back-off: 2 s after the first failure, 4 s after
                # the second.
                await asyncio.sleep(2 ** attempt)

    # The API may send an explicit JSON null instead of omitting a field, so
    # fall back with `or` rather than relying on dict.get() defaults.
    summary = data.get("advertSummaryList") or {}
    ads_raw = summary.get("advertSummary") or []
    total_hits = int(data.get("rowsFound") or 0)
    return ads_raw, total_hits
|
||||
|
||||
|
||||
def _parse_attributes(ad_dict: dict[str, Any]) -> dict[str, str]:
    """Flatten an ad's ``attributes.attribute`` list into a name -> value map.

    Each list entry is read as ``{"name": ..., "values": [...]}`` and only the
    first value is kept. Entries without a name or without values are
    skipped; when a name occurs more than once the last occurrence wins.

    Args:
        ad_dict: One raw advert dictionary.

    Returns:
        Mapping of attribute name to its first value. Empty when the ad has
        no attributes, or when ``attributes`` / ``attribute`` is ``None``.
    """
    # `or` also covers an explicit None, which dict.get() defaults do not.
    container = ad_dict.get("attributes") or {}
    entries = container.get("attribute") or []

    result: dict[str, str] = {}
    for entry in entries:
        if not isinstance(entry, dict):
            # Ignore malformed entries instead of failing the whole ad.
            continue
        name = entry.get("name")
        values = entry.get("values")
        if name and values:
            result[name] = values[0]
    return result
|
||||
|
||||
|
||||
def extract_ad_fields(ad_dict: dict[str, Any]) -> dict[str, Any]:
    """Reduce one raw advert dictionary to a flat dict of selected fields.

    Args:
        ad_dict: One raw advert dictionary; its attribute list is flattened
            with ``_parse_attributes``.

    Returns:
        A dict with these keys:

        * ``wh_ad_id`` -- ``str`` of the ad's ``id``; ``""`` when it is
          missing or ``None``.
        * ``title`` -- the ``HEADING`` attribute, else the ad's
          ``description``, else ``""``.
        * ``price`` -- ``PRICE/AMOUNT`` as ``float``, or ``None`` when absent
          or unparsable.
        * ``location`` -- the ``LOCATION`` attribute, or ``None``.
        * ``url`` -- ``https://www.willhaben.at/iad/<SEO_URL>``, or ``None``
          when there is no ``SEO_URL`` attribute.
        * ``published_at`` -- ISO 8601 string, or ``None`` when absent or
          unparsable.
    """
    attrs = _parse_attributes(ad_dict)

    title = attrs.get("HEADING") or ad_dict.get("description") or ""

    # Price from the "PRICE/AMOUNT" attribute (API format).
    price: float | None = None
    price_raw = attrs.get("PRICE/AMOUNT")
    if price_raw is not None:
        try:
            # Commas are dropped before parsing.
            price = float(str(price_raw).replace(",", ""))
        except (ValueError, TypeError):
            price = None

    seo_url = attrs.get("SEO_URL")
    url = f"https://www.willhaben.at/iad/{seo_url}" if seo_url else None

    # Publication time: PUBLISHED_String is preferred, CHANGED_String is the
    # fallback. Both are parsed as ISO 8601.
    published_at: str | None = None
    published_raw = attrs.get("PUBLISHED_String") or attrs.get("CHANGED_String")
    if isinstance(published_raw, str) and published_raw:
        try:
            # datetime.fromisoformat() before Python 3.11 rejects a trailing
            # "Z", so rewrite it as an explicit UTC offset first.
            parsed = datetime.fromisoformat(published_raw.replace("Z", "+00:00"))
            published_at = parsed.isoformat()
        except ValueError:
            published_at = None

    # A present-but-null id must not turn into the literal string "None".
    raw_id = ad_dict.get("id")
    wh_ad_id = "" if raw_id is None else str(raw_id)

    return {
        "wh_ad_id": wh_ad_id,
        "title": title,
        "price": price,
        "location": attrs.get("LOCATION"),
        "url": url,
        "published_at": published_at,
    }
|
||||
Reference in New Issue
Block a user