"""Fetch willhaben marketplace search results and flatten ads into plain dicts."""

import asyncio
import logging
from datetime import datetime, timezone
from typing import Any
from urllib.parse import quote_plus

import httpx

logger = logging.getLogger(__name__)

_API_URL = (
    "https://www.willhaben.at/webapi/ad-search/search/atz/seo/"
    "kaufen-und-verkaufen/marktplatz"
)

_HEADERS = {
    "accept": "application/json",
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
    "x-wh-client": "api@willhaben.at;responsive_web;server;1.0.0;desktop",
}

# Total number of HTTP attempts made by fetch_ads before giving up.
_MAX_ATTEMPTS = 3


async def fetch_ads(keyword: str) -> tuple[list[dict[str, Any]], int]:
    """Search the marketplace API for *keyword* and return the raw results.

    The request is attempted up to ``_MAX_ATTEMPTS`` times. After a failed
    attempt that is not the last one, the coroutine sleeps ``2 ** attempt``
    seconds (2s, then 4s) before retrying.

    Args:
        keyword: Search term sent as the ``keyword`` query parameter.

    Returns:
        A ``(ads, total_hits)`` tuple: the list found under
        ``advertSummaryList.advertSummary`` in the response (empty when the
        key is missing or null) and the ``rowsFound`` value as an int
        (0 when missing or null).

    Raises:
        Exception: Whatever the final failed attempt raised (transport error,
            non-2xx status via ``raise_for_status``, or JSON decoding error).
    """
    params = {
        "keyword": keyword,
        "rows": 30,
        "sort": 1,
    }

    async with httpx.AsyncClient(timeout=30.0) as client:
        for attempt in range(1, _MAX_ATTEMPTS + 1):
            try:
                resp = await client.get(_API_URL, headers=_HEADERS, params=params)
                resp.raise_for_status()
                data = resp.json()
                break
            except Exception as exc:
                logger.warning("fetch_ads attempt %d failed: %s", attempt, exc)
                if attempt == _MAX_ATTEMPTS:
                    # Out of retries: surface the last error to the caller.
                    raise
                await asyncio.sleep(2 ** attempt)

    # ``or`` (rather than a .get default) also covers keys that are present
    # but explicitly null in the JSON payload.
    summary_list = data.get("advertSummaryList") or {}
    ads_raw = summary_list.get("advertSummary") or []
    total_hits = int(data.get("rowsFound") or 0)
    return ads_raw, total_hits


def _parse_attributes(ad_dict: dict[str, Any]) -> dict[str, str]:
    """Flatten ``ad_dict["attributes"]["attribute"]`` into a name -> value map.

    Each entry contributes its ``name`` mapped to the first element of its
    ``values`` list. Entries with no name or no values are skipped; when a
    name occurs more than once, the last occurrence wins.
    """
    attr_list = (ad_dict.get("attributes") or {}).get("attribute") or []
    result: dict[str, str] = {}
    for attr in attr_list:
        name = attr.get("name")
        values = attr.get("values") or []
        if name and values:
            result[name] = values[0]
    return result


def _parse_price(raw: Any) -> float | None:
    """Convert a raw price value to float, or None if absent or unparsable."""
    if raw is None:
        return None
    try:
        # Commas are stripped before conversion (e.g. "1,200" -> 1200.0).
        return float(str(raw).replace(",", ""))
    except (ValueError, TypeError):
        return None


def _parse_iso_datetime(raw: Any) -> datetime | None:
    """Parse an ISO 8601 string, or return None if empty or unparsable."""
    if not raw:
        return None
    try:
        # A trailing "Z" is rewritten to an explicit UTC offset, which every
        # version of datetime.fromisoformat accepts.
        return datetime.fromisoformat(raw.replace("Z", "+00:00"))
    except (AttributeError, ValueError, TypeError):
        # AttributeError covers non-string values that lack .replace().
        return None


def extract_ad_fields(ad_dict: dict[str, Any]) -> dict[str, Any]:
    """Extract a flat record of selected fields from one raw ad dict.

    Parsing is best-effort: a field that is missing or cannot be parsed
    becomes ``None`` (or ``""`` for ``title``) instead of raising.

    Args:
        ad_dict: One raw ad entry, as returned in the list from ``fetch_ads``.

    Returns:
        A dict with the keys ``wh_ad_id``, ``title``, ``price``, ``location``,
        ``url``, ``published_at``, ``main_image_url``, ``postcode`` and
        ``modified_at``.
    """
    attrs = _parse_attributes(ad_dict)

    title = attrs.get("HEADING") or ad_dict.get("description", "")

    # Price from the "PRICE/AMOUNT" attribute (API format).
    price = _parse_price(attrs.get("PRICE/AMOUNT"))

    seo_url = attrs.get("SEO_URL")
    url = f"https://www.willhaben.at/iad/{seo_url}" if seo_url else None

    # Published time prefers PUBLISHED_String and falls back to
    # CHANGED_String only when the former is missing or empty.
    published_at = _parse_iso_datetime(
        attrs.get("PUBLISHED_String") or attrs.get("CHANGED_String")
    )
    modified_at = _parse_iso_datetime(attrs.get("CHANGED_String"))

    # Main image comes from the first advertImage entry, if there is one.
    images = (ad_dict.get("advertImageList") or {}).get("advertImage") or []
    first_image = images[0] if isinstance(images, list) and images else None
    main_image_url: str | None = None
    if isinstance(first_image, dict):
        main_image_url = first_image.get("referenceImageUrl")

    return {
        "wh_ad_id": str(ad_dict.get("id", "")),
        "title": title or "",
        "price": price,
        "location": attrs.get("LOCATION"),
        "url": url,
        "published_at": published_at,
        "main_image_url": main_image_url,
        "postcode": attrs.get("POSTCODE"),
        "modified_at": modified_at,
    }