Just a very simple script that provides an API for scraping a book’s category from DangDang, deployed on Heroku.
The API books.py
The following code is quite ugly, but I don't plan to update it; I'm just putting it here for the record.
import requests
from bs4 import BeautifulSoup
from flask import Flask , Response , jsonify
import os
class JSONResponse(Response):
    """Response class that converts plain Python values to JSON responses.

    When a view's return value reaches ``force_type`` as a ``list``,
    ``dict`` or ``bool``, it is serialised with ``jsonify`` before the
    normal response coercion runs.
    """

    @classmethod
    def force_type(cls, response, environ=None):
        """Coerce ``response`` into an instance of this response class.

        Args:
            response: The value to coerce. Lists, dicts and bools are
                serialised with ``jsonify`` first; anything else is passed
                through untouched.
            environ: Optional WSGI environment, forwarded to the parent
                implementation.

        Returns:
            The result of the parent class's ``force_type``.
        """
        if isinstance(response, (list, dict, bool)):
            response = jsonify(response)
        # Zero-argument super() resolves relative to JSONResponse. The former
        # super(Response, cls) started the lookup *after* flask.Response and
        # would have skipped any force_type defined on it.
        return super().force_type(response, environ)
app = Flask(__name__)
# Let views return bare lists/dicts/bools; JSONResponse serialises them.
app.response_class = JSONResponse
# Emit non-ASCII characters verbatim instead of as \uXXXX escapes
# (presumably so Chinese category names stay readable in the output).
app.config['JSON_AS_ASCII'] = False
# Flask 2.3 removed the JSON_AS_ASCII config key in favour of the JSON
# provider attribute, so set that as well whenever the provider exists
# (Flask >= 2.2). The config key above is kept for older Flask versions.
if hasattr(app, 'json'):
    app.json.ensure_ascii = False
# Browser-like request headers sent with every outgoing request made through
# ``requests_session`` (attached to the session below).
HEADERS = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
    'DNT': '1',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
}

# DangDang search URL template; ``{isbn}`` is filled in by ``get_class``.
SEARCH_URL = 'http://search.dangdang.com/?medium=01&key4={isbn}'

# One shared session so connections are reused between requests.
# ``requests.session()`` is a legacy alias; ``requests.Session()`` is the
# documented constructor.
requests_session = requests.Session()
# HEADERS used to be defined but never sent; attach them to the session so
# every request carries them.
requests_session.headers.update(HEADERS)
def _fetch_page(url):
    """Download ``url`` with the shared session and parse it with lxml.

    Returns:
        A ``(final_url, soup)`` tuple. ``final_url`` is the response URL
        (after any redirects) and is used as the base for resolving links
        found in the page.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            HTTP error status.
    """
    # requests applies no timeout by default; without one a stalled upstream
    # connection would block the worker indefinitely.
    response = requests_session.get(url, timeout=10)
    response.raise_for_status()
    return response.url, BeautifulSoup(response.text, 'lxml')


@app.route('/api/<isbn>')
def get_class(isbn):
    """Return the DangDang category path for the book with the given ISBN.

    Searches DangDang for ``isbn``, follows the first search hit to its
    product page, and collects the text of every tag inside the element
    that follows the ``#detail-category-path > label`` node.

    Args:
        isbn: The ISBN taken from the request path.

    Returns:
        On success, a list of strings (serialised to JSON by the response
        class). If no search hit or no category element is found, a JSON
        error body with status 404. If DangDang cannot be reached or
        answers with an error status, a JSON error body with status 502.
    """
    from urllib.parse import quote, urljoin

    try:
        # Escape the ISBN so characters such as '&' or '#' in the path
        # segment cannot alter the search query string.
        search_url, search_page = _fetch_page(
            SEARCH_URL.format(isbn=quote(isbn, safe='')))

        # Assume the first search hit is the book we want.
        first_hit = search_page.select_one('a[name="itemlist-title"]')
        href = first_hit.get('href') if first_hit else None
        if not href:
            # Previously this returned None, which Flask rejects as an
            # invalid view result and turns into a 500.
            return jsonify(error='No book found for ISBN {}'.format(isbn)), 404

        # The href may be relative or protocol-relative; resolve it against
        # the search page URL before following it.
        _, product_page = _fetch_page(urljoin(search_url, href))
    except requests.RequestException:
        return jsonify(error='Failed to fetch data from DangDang'), 502

    label = product_page.select_one('#detail-category-path > label')
    # find_next_sibling() skips whitespace text nodes, whereas .next_sibling
    # could return a string (not callable) or None.
    container = label.find_next_sibling() if label else None
    if container is None:
        return jsonify(error='No category found for ISBN {}'.format(isbn)), 404

    # find_all() with no arguments is what calling the tag did originally:
    # it yields every descendant tag of the container.
    return [node.get_text() for node in container.find_all()]
if __name__ == '__main__':
    # Listen on all interfaces. The port comes from the PORT environment
    # variable when set (a string), otherwise 6500; convert explicitly so the
    # server always receives an int rather than relying on Flask to coerce it.
    app.run(host='0.0.0.0', port=int(os.environ.get("PORT", 6500)))
Heroku
Build the Docker image from the following Dockerfile with `docker build -t registry.heroku.com/${APP_NAME}/web:latest .` (the trailing `.` is the build context), push it to Heroku with `docker push registry.heroku.com/${APP_NAME}/web:latest`, and then release it with `heroku container:release web -a ${APP_NAME}`.
# Base image; judging by its name it provides Alpine Linux with Python 3 and
# pip3 -- TODO confirm against the image's registry page.
FROM vaeum/alpine-python3-pip3
# Install the C/C++ toolchain and libxslt headers (presumably required to
# build lxml from source on Alpine), then the Python packages books.py imports.
RUN apk add --update --no-cache g++ gcc libxslt-dev && pip3 install requests flask beautifulsoup4 lxml
# Copy the API script into the image's current working directory.
COPY books.py ./
# Run the script with Python 3 when the container starts.
CMD ["python3", "books.py"]
BTW, Google Books also provides an API for looking up book details.