115 lines
3.5 KiB
Python
115 lines
3.5 KiB
Python
import asyncio
|
|
import logging
|
|
from datetime import datetime, timezone
|
|
from typing import Any
|
|
from urllib.parse import quote_plus
|
|
|
|
import httpx
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Endpoint queried by fetch_ads(); assembled from adjacent string literals
# only to keep the source lines short.
_API_URL = (
    "https://www.willhaben.at"
    "/webapi/ad-search/search/atz/seo"
    "/kaufen-und-verkaufen/marktplatz"
)

# Headers sent with every request issued by fetch_ads().
_HEADERS = {
    # Ask for a JSON response body.
    "accept": "application/json",
    # Browser-style user agent string.
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
    # NOTE(review): presumably identifies the calling client to the API;
    # the exact semantics are not visible here -- confirm before changing.
    "x-wh-client": "api@willhaben.at;responsive_web;server;1.0.0;desktop",
}
|
|
|
|
|
|
async def fetch_ads(keyword: str) -> tuple[list[dict[str, Any]], int]:
    """Query the search endpoint for *keyword* and return the raw results.

    The request is sent to ``_API_URL`` with ``_HEADERS`` and the query
    parameters ``keyword``, ``rows=30`` and ``sort=1``. It is attempted up
    to three times; after a failed attempt the coroutine sleeps
    ``2 ** attempt`` seconds (2 s, then 4 s) before retrying.

    Args:
        keyword: Search term passed as the ``keyword`` query parameter.

    Returns:
        A ``(ads, total_hits)`` tuple: ``ads`` is the list found under
        ``advertSummaryList.advertSummary`` in the JSON response (empty
        when absent or ``null``) and ``total_hits`` is ``rowsFound``
        converted to ``int`` (``0`` when absent or ``null``).

    Raises:
        Exception: Whatever the final attempt raised (transport error,
            non-2xx status via ``raise_for_status()``, or JSON decoding
            error) is re-raised unchanged once all attempts are used up.
    """
    max_attempts = 3
    params = {
        "keyword": keyword,
        "rows": 30,
        "sort": 1,
    }

    async with httpx.AsyncClient(timeout=30.0) as client:
        for attempt in range(1, max_attempts + 1):
            try:
                resp = await client.get(_API_URL, headers=_HEADERS, params=params)
                resp.raise_for_status()
                data = resp.json()
                break
            except Exception as exc:
                # Deliberately broad: every failure is logged and either
                # retried or re-raised, never swallowed.
                logger.warning("fetch_ads attempt %d failed: %s", attempt, exc)
                if attempt == max_attempts:
                    raise
                await asyncio.sleep(2 ** attempt)

    # ``dict.get(key, default)`` only falls back when the key is missing,
    # so an explicit JSON ``null`` must be normalised with ``or``.
    summary_list = data.get("advertSummaryList") or {}
    ads_raw = summary_list.get("advertSummary") or []
    total_hits = int(data.get("rowsFound") or 0)
    return ads_raw, total_hits
|
|
|
|
|
|
def _parse_attributes(ad_dict: dict[str, Any]) -> dict[str, str]:
    """Flatten an ad's ``attributes.attribute`` list into a name -> value map.

    Each list entry is expected to be a mapping with a ``name`` and a
    ``values`` sequence; only the first element of ``values`` is kept.
    Entries without a name or without any values are skipped, as are
    entries that are not mappings. If a name occurs more than once, the
    last occurrence wins.

    Args:
        ad_dict: A single raw ad object.

    Returns:
        Mapping from attribute name to its first value; empty when the ad
        has no (or ``null``) attributes.
    """
    # ``.get(key, default)`` does not cover an explicit ``null`` value,
    # so missing and ``None`` containers are both normalised here.
    container = ad_dict.get("attributes")
    attr_list = container.get("attribute") if isinstance(container, dict) else None

    result: dict[str, str] = {}
    for attr in attr_list or []:
        if not isinstance(attr, dict):
            continue
        name = attr.get("name")
        values = attr.get("values") or []
        if name and values:
            result[name] = values[0]
    return result
|
|
|
|
|
|
def _parse_iso_datetime(raw: Any) -> datetime | None:
    """Parse an ISO 8601 timestamp; return ``None`` if absent or malformed."""
    if not raw:
        return None
    try:
        # "Z" is rewritten to an explicit offset because
        # datetime.fromisoformat() only accepts the "Z" suffix from
        # Python 3.11 onwards. ``str()`` guards against non-string values.
        return datetime.fromisoformat(str(raw).replace("Z", "+00:00"))
    except (ValueError, TypeError):
        return None


def extract_ad_fields(ad_dict: dict[str, Any]) -> dict[str, Any]:
    """Extract a flat record of selected fields from one raw ad object.

    Attribute values are read via ``_parse_attributes``. Unparseable or
    missing values yield ``None`` (or ``""`` for ``wh_ad_id`` / ``title``)
    instead of raising.

    Args:
        ad_dict: A single raw ad object.

    Returns:
        A dict with the keys:

        * ``wh_ad_id`` -- ``ad_dict["id"]`` as a string, ``""`` if absent.
        * ``title`` -- the ``HEADING`` attribute, else the ad's
          ``description``, else ``""``.
        * ``price`` -- the ``PRICE/AMOUNT`` attribute as ``float`` or ``None``.
        * ``location`` -- the ``LOCATION`` attribute or ``None``.
        * ``url`` -- ``https://www.willhaben.at/iad/<SEO_URL>`` or ``None``.
        * ``published_at`` -- parsed ``PUBLISHED_String``, falling back to
          ``CHANGED_String``; ``None`` if neither parses.
        * ``main_image_url`` -- ``referenceImageUrl`` of the first entry in
          ``advertImageList.advertImage`` or ``None``.
        * ``postcode`` -- the ``POSTCODE`` attribute or ``None``.
        * ``modified_at`` -- parsed ``CHANGED_String`` or ``None``.
    """
    attrs = _parse_attributes(ad_dict)

    title = attrs.get("HEADING") or ad_dict.get("description", "")

    # Price from the "PRICE/AMOUNT" attribute. Commas are stripped before
    # conversion (NOTE(review): presumably thousands separators -- confirm
    # against the API's actual number format).
    price: float | None = None
    price_raw = attrs.get("PRICE/AMOUNT")
    if price_raw is not None:
        try:
            price = float(str(price_raw).replace(",", ""))
        except (ValueError, TypeError):
            price = None

    seo_url = attrs.get("SEO_URL")
    url = f"https://www.willhaben.at/iad/{seo_url}" if seo_url else None

    # Publication time: prefer PUBLISHED_String, fall back to CHANGED_String.
    published_at = _parse_iso_datetime(
        attrs.get("PUBLISHED_String") or attrs.get("CHANGED_String")
    )
    modified_at = _parse_iso_datetime(attrs.get("CHANGED_String"))

    # Main image from the first advertImage entry. An explicit ``null``
    # container is treated the same as a missing one.
    images = (ad_dict.get("advertImageList") or {}).get("advertImage") or []
    first_image = images[0] if isinstance(images, (list, tuple)) and images else None
    main_image_url: str | None = (
        first_image.get("referenceImageUrl") if isinstance(first_image, dict) else None
    )

    # Avoid turning a ``null`` id into the literal string "None".
    ad_id = ad_dict.get("id")

    return {
        "wh_ad_id": "" if ad_id is None else str(ad_id),
        "title": title or "",
        "price": price,
        "location": attrs.get("LOCATION"),
        "url": url,
        "published_at": published_at,
        "main_image_url": main_image_url,
        "postcode": attrs.get("POSTCODE"),
        "modified_at": modified_at,
    }
|