Fix AH scraping
commit 063ac3a4fd (parent 35b0446447)
4 changed files with 37 additions and 25 deletions

@@ -67,8 +67,8 @@ class NoAutoUpdate(Exception):
 
 
 def find_product_details(product: Product):
-    if (ah_sku := product.metadata.get('ah', None)):
-        return scrapers.ah_get_by_sku(ah_sku, int(product.metadata['qty']))
+    if 'ah' in product.metadata:
+        return scrapers.ah_get_by_gtin(product.aliases[0])
     if 'sligro' in product.metadata:
         return scrapers.sligro_get_by_gtin(product.aliases[0])
     raise NoAutoUpdate()
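In effect, the AH branch of find_product_details no longer reads an ah.nl SKU (matching ^wi\d+$) and a separate 'qty' count out of product.metadata; the 'ah' key now serves only as a flag, and the lookup key is the GTIN-13 stored as the product's first alias. A minimal sketch of the new calling convention, using a hypothetical stand-in for the product record (the real class lives elsewhere in this repo):

from dataclasses import dataclass, field

@dataclass
class StubProduct:
    # Hypothetical stand-in: only the two fields the dispatcher reads.
    metadata: dict = field(default_factory=dict)
    aliases: list = field(default_factory=list)

p = StubProduct(metadata={'ah': '1'}, aliases=['8711327538481'])
# find_product_details(p) now resolves to scrapers.ah_get_by_gtin('8711327538481');
# before this commit it called scrapers.ah_get_by_sku(<'wi...' SKU>, qty) instead.
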
@@ -1,3 +1,4 @@
+from dataclasses import dataclass
 from decimal import Decimal
 from functools import reduce
 from pyquery import PyQuery as pq
@@ -7,47 +8,47 @@ import os
 import requests
 import subprocess
 import logging
+from supermarktconnector.ah import AHConnector
+from typing import List
 
 
 vat = Decimal('1.09')
 
 
+@dataclass
 class Product:
-    def __init__(self, *, name, price, gtin, units, aliases=[]):
-        self.name = name
-        self.price = price
-        self.gtin = gtin
-        self.units = units
-        self.aliases = aliases
+    name: str
+    price: Decimal
+    gtin: str
+    units: int
+    aliases: List[str]
 
     def __str__(self):
        return self.name
 
 
-def links_get(url):
-    compl = subprocess.run(['links', '-source', url], capture_output=True)
-    return compl.stdout
+_ah = None
 
+def ah_get_by_gtin(gtin13):
+    assert re.match(r'^\d{13}$', gtin13)
 
-def ah_get_by_sku(ah_sku, units):
-    assert re.match(r'^wi\d+$', ah_sku)
+    global _ah
+    if not _ah:
+        _ah = AHConnector()
 
-    html_src = links_get(f'https://www.ah.nl/producten/product/{ah_sku}')
-    doc = pq(html_src)
+    ah_prod = _ah.get_product_by_barcode(gtin13)
 
-    ld_jsons = doc('script[type="application/ld+json"]')
-    for j in ld_jsons:
-        schema = json.loads(j.text)
-        if schema['@type'] == 'Product' and schema['sku'] == ah_sku:
-            break
-    else:
-        raise Exception(f'ah.nl returned no JSON metadata for SKU {ah_sku}')
+    units_description = ah_prod['salesUnitSize']
+    units = 1
+    if (m := re.search(r'^(\d+)', units_description)):
+        units = int(m[1])
 
     return Product(
-        name=schema['name'],
-        price=Decimal(schema['offers']['price']),
-        gtin=schema['gtin13'],
+        name=ah_prod['title'],
+        price=Decimal(ah_prod['priceBeforeBonus']),
+        gtin=gtin13,
         units=units,
+        aliases=[],
     )
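Two separate fixes land in this file: Product becomes a @dataclass (which also eliminates the old mutable default argument aliases=[] in __init__, a classic shared-list hazard in Python, and is why the call site now passes aliases=[] explicitly), and the AH lookup stops shelling out to the links browser and scraping ld+json from the product page, instead querying the AH API through supermarktconnector by barcode. A minimal sketch of what the new ah_get_by_gtin does, assuming supermarktconnector is installed and the AH endpoint is reachable; the response fields used ('title', 'priceBeforeBonus', 'salesUnitSize') are exactly the ones the diff reads:

import re
from decimal import Decimal
from supermarktconnector.ah import AHConnector

connector = AHConnector()  # the module caches one instance in _ah after first use
ah_prod = connector.get_product_by_barcode('8711327538481')  # Ola Liuk, as in the new test

# The unit count is the leading integer of salesUnitSize
# (e.g. '8 stuks' -> 8), defaulting to 1 when none is present.
units = 1
if (m := re.search(r'^(\d+)', ah_prod['salesUnitSize'])):
    units = int(m[1])

print(ah_prod['title'], Decimal(ah_prod['priceBeforeBonus']), units)
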
@@ -1,6 +1,16 @@
 from scrapers import *
 
 
+def test_scrape_ah():
+    # Ola Liuk
+    prod = ah_get_by_gtin('8711327538481')
+    assert type(prod) is Product
+    assert prod.name == 'Ola Liuk'
+    assert prod.gtin == '8711327538481'
+    assert prod.units == 8
+    assert prod.aliases == []
+
+
 def test_scrape_sligro():
     # Cola zero sugar
     prod = sligro_get_by_gtin('5000112659184')
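The new test pins the full Product contract (name, gtin, units, aliases) for one known barcode. Both tests talk to live supermarket endpoints, so they need network access and may break when the remote catalogue changes. A sketch for running only the scraper tests, assuming a standard pytest setup:

import pytest

# Select both test_scrape_ah and test_scrape_sligro by keyword;
# equivalent to `pytest -k scrape -q` on the command line.
pytest.main(['-k', 'scrape', '-q'])
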
@@ -1,3 +1,4 @@
 pyquery
 pytest
 requests
+supermarktconnector