# Packaging info for products whose search result cannot be handled by the regular
# contentDescription parsing, keyed by the GTIN exactly as it appears in the product's 'gtin'
# field. Values have the same (units, volume) form that the caller unpacks.
_PACKAGING_OVERRIDES = {
    # Tjendrawasih Bapao kip: no contentDescription field?
    '08712641001903': (12, '120 gram'),
}


def get_packaging_info(product):
    """Return the packaging info of a product taken from the search results.

    Products listed in ``_PACKAGING_OVERRIDES`` get their hard-coded value. For every other
    product the 'contentDescription' field is handed to ``parse_content_description`` and its
    result is returned unchanged.

    Args:
        product: A product object from the search response. Its 'gtin' key is optional here;
            'contentDescription' is required unless the product has an override.

    Returns:
        The value the caller unpacks as ``units, volume``, e.g. ``(12, '120 gram')`` for the
        override entry.

    Raises:
        KeyError: If the product has no override and no 'contentDescription' field.
    """
    # .get() on both levels: a product without a 'gtin' key simply has no override and falls
    # through to the regular parsing instead of failing on the lookup itself.
    override = _PACKAGING_OVERRIDES.get(product.get('gtin'))
    if override is not None:
        return override
    # The contentDescription field holds the number of individual packages per box sold.
    return parse_content_description(product['contentDescription'])


# The contentDescription seems to have a formatting consistent enough for regex matching. Some
# products have multiple levels of packaging, but the last or only component is always the
# volume or weight.