This is a very simple script that exposes an API for scraping a book's category from DangDang; it is deployed on Heroku.
The API books.py
The following code is quite ugly, but I don't plan to update it; I'm just putting it here for the record.
import requests
from bs4 import BeautifulSoup
from flask import Flask, Response, jsonify
import os
class JSONResponse(Response):
    """Response class that lets views return plain Python containers.

    Lists, dicts and bools handed to ``force_type`` are serialised with
    ``jsonify`` before the regular response coercion runs.
    """

    @classmethod
    def force_type(cls, response, environ=None):
        """Coerce *response* into an instance of this response class.

        Args:
            response: The value to coerce. A ``list``, ``dict`` or ``bool``
                is first converted to a JSON response via ``jsonify``.
            environ: Optional WSGI environment, forwarded unchanged to the
                parent implementation.

        Returns:
            Whatever the parent ``force_type`` returns for the (possibly
            jsonified) response.
        """
        if isinstance(response, (list, dict, bool)):
            response = jsonify(response)
        # Zero-argument super() starts the MRO lookup right after this class,
        # so flask's own Response is not skipped.
        return super().force_type(response, environ)
# Flask application. JSONResponse is installed as the response class so that
# list/dict/bool values reaching force_type are converted with jsonify.
app = Flask(__name__)
app.response_class = JSONResponse
# Keep non-ASCII characters (e.g. Chinese category names) unescaped in JSON.
app.config['JSON_AS_ASCII'] = False

# Browser-like headers sent with every outgoing request.
HEADERS = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
    'DNT': '1',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
}

# Search endpoint; ``{isbn}`` is filled in per request.
SEARCH_URL = 'http://search.dangdang.com/?medium=01&key4={isbn}'

# One shared session so connections are reused across requests. HEADERS was
# previously defined but never attached to anything, so it is applied here.
requests_session = requests.Session()
requests_session.headers.update(HEADERS)
@app.route('/api/<isbn>')
def get_class(isbn):
    """Look up a book on DangDang by ISBN and return its category path.

    The search page is fetched first; the first search hit is assumed to be
    the wanted book. Its product page is then fetched and the elements that
    follow the ``#detail-category-path > label`` node are read as categories.

    Args:
        isbn: ISBN taken from the URL path.

    Returns:
        A list with the text of each category element on success, or
        ``([], 404)`` when no search hit or no category section is found.

    Raises:
        requests.RequestException: If either HTTP request fails or times out.
    """
    from urllib.parse import quote, urljoin

    timeout = 10  # seconds; without it a stalled upstream blocks the worker

    # Search for the ISBN first. Quote it so characters such as '&' cannot
    # inject extra query parameters into the search URL.
    search_page = requests_session.get(
        SEARCH_URL.format(isbn=quote(isbn, safe='')),
        timeout=timeout,
    )
    soup = BeautifulSoup(search_page.text, 'lxml')

    # Assume the first hit is the one we want.
    item = soup.select_one('a[name="itemlist-title"]')
    href = item.get('href') if item is not None else None
    if not href:
        # Returning None is not a valid Flask response; report "not found".
        return [], 404

    # Open the product page. urljoin leaves absolute links untouched and
    # resolves relative or protocol-relative ones against the search URL.
    product_page = requests_session.get(
        urljoin(search_page.url, href),
        timeout=timeout,
    )
    soup = BeautifulSoup(product_page.text, 'lxml')

    label = soup.select_one('#detail-category-path > label')
    # find_next_sibling() skips whitespace/text nodes, which are not callable
    # the way a Tag is and would otherwise raise here.
    container = label.find_next_sibling() if label is not None else None
    if container is None:
        return [], 404

    # Calling a Tag is shorthand for find_all(); spell it out for clarity.
    return [node.get_text() for node in container.find_all()]
if __name__ == '__main__':
    # Environment variables are always strings, so convert PORT explicitly;
    # fall back to 6500 when it is not set (e.g. when running locally).
    app.run(host='0.0.0.0', port=int(os.environ.get("PORT", 6500)))
Heroku
Build the Docker image from the Dockerfile below with `docker build -t registry.heroku.com/${APP_NAME}/web:latest .` (note the trailing `.`, which is the build context), push it to Heroku with `docker push registry.heroku.com/${APP_NAME}/web:latest`, and then release it with `heroku container:release web -a ${APP_NAME}`.
# Alpine-based image that provides python3 and pip3.
FROM vaeum/alpine-python3-pip3
# Install the build toolchain and libxslt headers (presumably needed to
# compile lxml on Alpine - confirm), then the Python dependencies.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN apk add --update --no-cache g++ gcc libxslt-dev && pip3 install --no-cache-dir requests flask beautifulsoup4 lxml
# Copy the API script into the image's working directory and run it.
COPY books.py ./
CMD ["python3", "books.py"]
BTW, Google Books also provides an API for looking up a book's details.