From 063ac3a4fd9a74ef296b91552d4d5c923f9cd886 Mon Sep 17 00:00:00 2001 From: polyfloyd Date: Sun, 22 Dec 2024 18:03:41 +0100 Subject: [PATCH] Fix AH scraping --- inflatinator/revbank.py | 4 +-- inflatinator/scrapers.py | 47 ++++++++++++++++++----------------- inflatinator/scrapers_test.py | 10 ++++++++ requirements.txt | 1 + 4 files changed, 37 insertions(+), 25 deletions(-) diff --git a/inflatinator/revbank.py b/inflatinator/revbank.py index 46bb3e3..5ddb534 100644 --- a/inflatinator/revbank.py +++ b/inflatinator/revbank.py @@ -67,8 +67,8 @@ class NoAutoUpdate(Exception): def find_product_details(product: Product): - if (ah_sku := product.metadata.get('ah', None)): - return scrapers.ah_get_by_sku(ah_sku, int(product.metadata['qty'])) + if 'ah' in product.metadata: + return scrapers.ah_get_by_gtin(product.aliases[0]) if 'sligro' in product.metadata: return scrapers.sligro_get_by_gtin(product.aliases[0]) raise NoAutoUpdate() diff --git a/inflatinator/scrapers.py b/inflatinator/scrapers.py index 18609d3..e2b6652 100644 --- a/inflatinator/scrapers.py +++ b/inflatinator/scrapers.py @@ -1,3 +1,4 @@ +from dataclasses import dataclass from decimal import Decimal from functools import reduce from pyquery import PyQuery as pq @@ -7,47 +8,47 @@ import os import requests import subprocess import logging +from supermarktconnector.ah import AHConnector +from typing import List vat = Decimal('1.09') +@dataclass class Product: - def __init__(self, *, name, price, gtin, units, aliases=[]): - self.name = name - self.price = price - self.gtin = gtin - self.units = units - self.aliases = aliases + name: str + price: Decimal + gtin: str + units: int + aliases: List[str] def __str__(self): return self.name -def links_get(url): - compl = subprocess.run(['links', '-source', url], capture_output=True) - return compl.stdout +_ah = None +def ah_get_by_gtin(gtin13): + assert re.match(r'^\d{13}$', gtin13) -def ah_get_by_sku(ah_sku, units): - assert re.match(r'^wi\d+$', ah_sku) + global _ah + if not _ah: + _ah = AHConnector() - html_src = links_get(f'https://www.ah.nl/producten/product/{ah_sku}') - doc = pq(html_src) + ah_prod = _ah.get_product_by_barcode(gtin13) - ld_jsons = doc('script[type="application/ld+json"]') - for j in ld_jsons: - schema = json.loads(j.text) - if schema['@type'] == 'Product' and schema['sku'] == ah_sku: - break - else: - raise Exception(f'ah.nl returned no JSON metadata for SKU {ah_sku}') + units_description = ah_prod['salesUnitSize'] + units = 1 + if (m := re.search(r'^(\d+)', units_description)): + units = int(m[1]) return Product( - name=schema['name'], - price=Decimal(schema['offers']['price']), - gtin=schema['gtin13'], + name=ah_prod['title'], + price=Decimal(ah_prod['priceBeforeBonus']), + gtin=gtin13, units=units, + aliases=[], ) diff --git a/inflatinator/scrapers_test.py b/inflatinator/scrapers_test.py index 0f293f2..02fd1bc 100644 --- a/inflatinator/scrapers_test.py +++ b/inflatinator/scrapers_test.py @@ -1,6 +1,16 @@ from scrapers import * +def test_scrape_ah(): + # Ola Liuk + prod = ah_get_by_gtin('8711327538481') + assert type(prod) is Product + assert prod.name == 'Ola Liuk' + assert prod.gtin == '8711327538481' + assert prod.units == 8 + assert prod.aliases == [] + + def test_scrape_sligro(): # Cola zero sugar prod = sligro_get_by_gtin('5000112659184') diff --git a/requirements.txt b/requirements.txt index a8df116..68df88e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ pyquery pytest requests +supermarktconnector