From 7a8997835d56f9345095c6dd64aa3e6c6a89a471 Mon Sep 17 00:00:00 2001 From: polyfloyd Date: Mon, 21 Apr 2025 19:37:03 +0200 Subject: [PATCH] sligro: Return substitute products when available --- inflatinator/scrapers.py | 50 ++++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/inflatinator/scrapers.py b/inflatinator/scrapers.py index b5f82d3..435301e 100644 --- a/inflatinator/scrapers.py +++ b/inflatinator/scrapers.py @@ -8,7 +8,7 @@ import os import requests import logging from supermarktconnector.ah import AHConnector -from typing import List +from typing import List, Optional vat = Decimal('1.09') @@ -26,6 +26,7 @@ class Product: gtin: str units: int aliases: List[str] + replacement: Optional["Product"] = None def __str__(self): return self.name @@ -84,25 +85,46 @@ def sligro_client(): return _sess -def sligro_get_by_gtin(gtin13): +def sligro_get_by_sku(sku, _recurse=0): + assert re.match(r'^\d{4,12}$', sku) + return _sligro_get(sku, _recurse=_recurse) + + +def sligro_get_by_gtin(gtin13, _recurse=0): assert re.match(r'^\d{13}$', gtin13) gtin14 = f'{gtin13:0>14}' - # The search feature of the website returns results in JSON and handles GTIN formats. Neat! # However, it can be a bit picky about leading zeros, so we try to query with GTIN14 as that is # what works in the most cases. Sometimes GTIN13 is still required though for gtin_whatever in [gtin14, gtin13]: - response = requests.get(f'https://www.sligro.nl/api/product-overview/sligro-nl/nl/query/3?term={gtin_whatever}') - response.raise_for_status() - body = response.json() - if 'products' in body: - break - else: + try: + return _sligro_get(gtin_whatever, _recurse=_recurse) + except ProductNotFoundError: + continue + raise ProductNotFoundError() + + +def _sligro_get(query, *, _recurse=0): + # A runaway recursion could DoS the sligro API, which is impolite :) + assert _recurse <= 1 + + response = requests.get(f'https://www.sligro.nl/api/product-overview/sligro-nl/nl/query/3?term={query}') + response.raise_for_status() + body = response.json() + if 'products' not in body: raise ProductNotFoundError() - product = body['products'][0] + if len(body['products']) > 1: + product = next(filter(lambda p: 'productReferenceReplace' not in p, body['products'])) + else: + product = body['products'][0] + sku = product["code"] + replacement = None + if 'productReferenceReplace' in product: + replacement = sligro_get_by_sku(product['productReferenceReplace'][0], _recurse=_recurse+1) + # Query the product page itself, there is more info that we need on there. In the website, the # final path element is a derivation of the contentDescription field. It must be present, but # matches anything. @@ -132,12 +154,16 @@ def sligro_get_by_gtin(gtin13): else: price_obj = pricing['price'] + name = product["name"] + name = re.sub(' - Wordt binnenkort vervangen door.+$', '', name) + return Product( - name=f'{product["brandName"]} {product["name"]} ({volume})', + name=f'{product["brandName"]} {name} ({volume})', price=Decimal(price_obj['value']) * vat, - gtin=gtin13, + gtin=product['gtin'].lstrip('0'), units=units, aliases=[sub_gtin] if sub_gtin else [], + replacement=replacement, )