A very simple script that exposes an API for scraping a book’s category from DangDang, deployed on Heroku.


The following code is quite ugly, but I don't intend to update it; I'm just putting it here for the record.

import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from flask import Flask, Response, jsonify

class JSONResponse(Response):
    """Response class that accepts plain lists, dicts and booleans.

    Intended to be installed as ``app.response_class`` so that a view may
    return such a value directly and have it serialized with ``jsonify``.
    """

    @classmethod
    def force_type(cls, response, environ=None):
        """Coerce ``response`` into an instance of this response class.

        Args:
            response: The value to convert. Lists, dicts and booleans are
                passed through ``jsonify`` first; anything else is handed
                to the parent implementation untouched.
            environ: Optional WSGI environment forwarded to the parent.

        Returns:
            Whatever the parent ``force_type`` returns for the (possibly
            jsonified) value.
        """
        # force_type is invoked on the class itself, so it must be a
        # classmethod; as a plain function the arguments would be shifted
        # by one (the return value bound to ``cls``).
        if isinstance(response, (list, dict, bool)):
            response = jsonify(response)
        return super().force_type(response, environ)

# Application object: JSON output keeps non-ASCII characters as-is, and
# responses are built through JSONResponse.
app = Flask(__name__)
app.config.update(JSON_AS_ASCII=False)
app.response_class = JSONResponse

    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
    'DNT': '1',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',

SEARCH_URL = '{isbn}'
requests_session = requests.session()

def get_class(isbn, timeout=10):
    """Look up a book by ISBN and return its category path.

    The ISBN is substituted into ``SEARCH_URL``; the first search hit's
    product page is then fetched and the category element is read from it.

    Args:
        isbn: ISBN substituted into ``SEARCH_URL``.
        timeout: Seconds to wait on each HTTP request before requests
            raises a timeout error.

    Returns:
        A list with the text of every tag inside the element that follows
        the ``#detail-category-path > label`` node, or ``None`` when the
        search hit, its link, or the category element is missing.
    """
    # Search the ISBN first.
    search_page = requests_session.get(
        SEARCH_URL.format(isbn=isbn), timeout=timeout)
    soup = BeautifulSoup(search_page.text, 'lxml')

    # Assume the first hit is the book we want.
    item = soup.select_one('a[name="itemlist-title"]')
    if item is None:
        return None
    href = item.get('href')
    if not href:
        return None

    # Absolute links pass through unchanged; relative and protocol-relative
    # ones are resolved against the search page's own URL.
    item_url = urljoin(search_page.url, href)
    product_page = requests_session.get(item_url, timeout=timeout)
    soup = BeautifulSoup(product_page.text, 'lxml')

    label = soup.select_one('#detail-category-path > label')
    if label is None:
        return None

    # find_next_sibling() skips text nodes between the label and the
    # following tag; calling ``next_sibling`` directly fails when the
    # immediate sibling is a string.
    container = label.find_next_sibling()
    if container is None:
        return None
    return [tag.get_text() for tag in container.find_all(True)]

# NOTE(review): no route is registered anywhere in the visible source; the
# view function that calls get_class() appears to be missing -- confirm
# against the original script before deploying.
if __name__ == '__main__':
    # Listen on all interfaces so the server is reachable from outside the
    # container. PORT comes from the environment as a string, hence int().
    app.run(host='0.0.0.0', port=int(os.environ.get("PORT", 6500)))


Build the Docker image with the following Dockerfile and the command docker build -t registry.heroku.com/${APP_NAME}/web:latest . , push it to Heroku with docker push registry.heroku.com/${APP_NAME}/web:latest, and then release it with heroku container:release web -a ${APP_NAME}.

# Base image; presumably Alpine with Python 3 and pip3 preinstalled (going by its name).
FROM vaeum/alpine-python3-pip3

# Install build tools and libxslt headers (presumably needed to compile lxml),
# then the Python packages the script imports.
RUN apk add --update --no-cache g++ gcc libxslt-dev && pip3 install requests flask beautifulsoup4 lxml


# NOTE(review): the script path below is empty and no COPY/ADD step is
# visible -- the step that adds the script and its filename appear to be missing.
CMD ["python3", ""]

BTW, Google Books also provides an API for looking up book details.