Fix AH scraping

This commit is contained in:
polyfloyd 2024-12-22 18:03:41 +01:00
parent 35b0446447
commit 063ac3a4fd
4 changed files with 37 additions and 25 deletions

View file

@@ -67,8 +67,8 @@ class NoAutoUpdate(Exception):
def find_product_details(product: Product): def find_product_details(product: Product):
if (ah_sku := product.metadata.get('ah', None)): if 'ah' in product.metadata:
return scrapers.ah_get_by_sku(ah_sku, int(product.metadata['qty'])) return scrapers.ah_get_by_gtin(product.aliases[0])
if 'sligro' in product.metadata: if 'sligro' in product.metadata:
return scrapers.sligro_get_by_gtin(product.aliases[0]) return scrapers.sligro_get_by_gtin(product.aliases[0])
raise NoAutoUpdate() raise NoAutoUpdate()

View file

@@ -1,3 +1,4 @@
from dataclasses import dataclass
from decimal import Decimal from decimal import Decimal
from functools import reduce from functools import reduce
from pyquery import PyQuery as pq from pyquery import PyQuery as pq
@@ -7,47 +8,47 @@ import os
import requests import requests
import subprocess import subprocess
import logging import logging
from supermarktconnector.ah import AHConnector
from typing import List
vat = Decimal('1.09') vat = Decimal('1.09')
@dataclass
class Product: class Product:
def __init__(self, *, name, price, gtin, units, aliases=[]): name: str
self.name = name price: Decimal
self.price = price gtin: str
self.gtin = gtin units: int
self.units = units aliases: List[str]
self.aliases = aliases
def __str__(self): def __str__(self):
return self.name return self.name
def links_get(url): _ah = None
compl = subprocess.run(['links', '-source', url], capture_output=True)
return compl.stdout
def ah_get_by_gtin(gtin13):
assert re.match(r'^\d{13}$', gtin13)
def ah_get_by_sku(ah_sku, units): global _ah
assert re.match(r'^wi\d+$', ah_sku) if not _ah:
_ah = AHConnector()
html_src = links_get(f'https://www.ah.nl/producten/product/{ah_sku}') ah_prod = _ah.get_product_by_barcode(gtin13)
doc = pq(html_src)
ld_jsons = doc('script[type="application/ld+json"]') units_description = ah_prod['salesUnitSize']
for j in ld_jsons: units = 1
schema = json.loads(j.text) if (m := re.search(r'^(\d+)', units_description)):
if schema['@type'] == 'Product' and schema['sku'] == ah_sku: units = int(m[1])
break
else:
raise Exception(f'ah.nl returned no JSON metadata for SKU {ah_sku}')
return Product( return Product(
name=schema['name'], name=ah_prod['title'],
price=Decimal(schema['offers']['price']), price=Decimal(ah_prod['priceBeforeBonus']),
gtin=schema['gtin13'], gtin=gtin13,
units=units, units=units,
aliases=[],
) )

View file

@@ -1,6 +1,16 @@
from scrapers import * from scrapers import *
def test_scrape_ah():
# Ola Liuk
prod = ah_get_by_gtin('8711327538481')
assert type(prod) is Product
assert prod.name == 'Ola Liuk'
assert prod.gtin == '8711327538481'
assert prod.units == 8
assert prod.aliases == []
def test_scrape_sligro(): def test_scrape_sligro():
# Cola zero sugar # Cola zero sugar
prod = sligro_get_by_gtin('5000112659184') prod = sligro_get_by_gtin('5000112659184')

View file

@@ -1,3 +1,4 @@
pyquery pyquery
pytest pytest
requests requests
supermarktconnector