Fix decitre extraction

This commit is contained in:
Gabriel Augendre 2019-06-01 21:05:03 +02:00
parent 28686d8893
commit 20369de568

View file

@@ -1,20 +1,18 @@
import logging
import re import re
import bs4 import bs4
import requests import requests
from django.contrib import messages from django.contrib import messages
from django.contrib.auth.mixins import PermissionRequiredMixin
from django.http import HttpResponseRedirect, JsonResponse from django.http import HttpResponseRedirect, JsonResponse
from django.shortcuts import get_object_or_404, redirect from django.shortcuts import redirect
from django.urls import reverse from django.urls import reverse
from django.views.decorators.cache import cache_page from django.views.decorators.cache import cache_page
from django.views.generic import CreateView, ListView, UpdateView, DeleteView, FormView, DetailView, TemplateView from django.views.generic import CreateView, UpdateView, DeleteView, TemplateView
from manuels.forms import AddBookForm, AddSuppliesForm, EditBookForm, EditSuppliesForm from manuels.forms import AddBookForm, AddSuppliesForm, EditBookForm, EditSuppliesForm
from manuels.models import Teacher, Book, SuppliesRequirement from manuels.models import Teacher, Book, SuppliesRequirement
import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -266,7 +264,7 @@ def validate_isbn(isbn):
# We are able to cache the response because it's very unlikely that the details of a book will change through time # We are able to cache the response because it's very unlikely that the details of a book will change through time
@cache_page(7 * 24 * 60 * 60) # @cache_page(7 * 24 * 60 * 60)
def isbn_api(request, isbn): def isbn_api(request, isbn):
isbn = isbn.strip().replace('-', '') isbn = isbn.strip().replace('-', '')
@@ -298,29 +296,32 @@ def isbn_api(request, isbn):
title = title[0] title = title[0]
if title.span: if title.span:
title.span.extract() title.span.extract()
title = title.getText().strip() title = title.get_text(strip=True)
authors = decitre_soup.select('h2.authors') authors = decitre_soup.select('.authors')
if authors: if authors:
authors = authors[0] authors = authors[0]
authors = authors.getText().strip() authors = authors.get_text(strip=True)
price = decitre_soup.select('.product-add-to-cart-wrapper div.price span.final-price') price = decitre_soup.select('div.price span.final-price')
logger.info(f'Found price html {price}')
if price: if price:
price = price[0] price = price[0]
price = price.getText().replace('€', '').replace(',', '.').strip() price = price.get_text().replace('€', '').replace(',', '.').strip()
logger.info(f'Final price {price}')
year = None year = None
editor = None editor = None
extra_info = decitre_soup.select('ul.extra-infos.hide-on-responsive') extra_info = decitre_soup.select('.informations-container')
if not extra_info:
logger.debug('#fiche-technique')
extra_info = decitre_soup.select('#fiche-technique')
if extra_info: if extra_info:
extra_info = extra_info[0].getText().strip() extra_info = extra_info[0].get_text(strip=True)
matches = re.match('^(?P<editor>.+)\nParu le : \d{2}/\d{2}/(?P<year>\d{4})$', extra_info) matches = re.search(r'Date de parution(?: :)?\d{2}/\d{2}/(?P<year>\d{4})Editeur(?: :)?(?P<editor>[\w ]+)ISBN', extra_info)
groups = matches.groupdict() if matches:
year = groups.get('year') groups = matches.groupdict()
editor = groups.get('editor') year = groups.get('year')
editor = groups.get('editor').strip()
return JsonResponse({ return JsonResponse({
'title': title, 'title': title,