【已解决】爬取mp.codeup.cn中的英语教材电子书资源

折腾：

【记录】爬虫爬数据义务教育教科书义教教科书

期间，继续去看看另外2本电子书：

电子样书点拨八年级英语下

电子样书点拔训练八年级英语下

去chrome中打开，调试看看。

这个是封面：

之后是其他几张图片：

最开始的几张图片，都预先加载了。

去搜索这些值怎么出来的。

搜封面图片的：

6956499_4689C78AA4AF40C6DFC8ACA2AABAB849

找到：

Request URL:
https://biz.bookln.cn/ebookpageservices/queryAllPageByEbookId.do

返回的json，太长，此处拷贝出来，再去格式化后是：

摘录其中一部分：

{
  "data": {
    "data": [{
      "ebookId": 52365,
      "gmtCreate": 1582279296000,
      "gmtModified": 1582279296000,
      "id": 2578316,
      "imgurl": "https://yuntisyscdn.bookln.cn/server/ebook/pdf/bookln/52365/6956499_4689C78AA4AF40C6DFC8ACA2AABAB849.png",
      "isDelete": 0,
      "pageNo": 1,
      "status": 1,
      "userId": 6956499,
      "userName": "荣德基教育:马强"
    }, {
      "ebookId": 52365,
      "gmtCreate": 1582279296000,
      "gmtModified": 1582279296000,
      "id": 2578317,
      "imgurl": "https://yuntisyscdn.bookln.cn/server/ebook/pdf/bookln/52365/6956499_079BF84EAA676ED226D5B60E57B3B6BC.png",
      "isDelete": 0,
      "pageNo": 2,
      "status": 1,
      "userId": 6956499,
      "userName": "荣德基教育:马强"
    }, {
      "ebookId": 52365,
      "gmtCreate": 1582279296000,
      "gmtModified": 1582279296000,
      "id": 2578318,
      "imgurl": "https://yuntisyscdn.bookln.cn/server/ebook/pdf/bookln/52365/6956499_75158EE94FCFEB6AA46FB6E326D4EA1C.png",
      "isDelete": 0,
      "pageNo": 3,
      "status": 1,
      "userId": 6956499,
      "userName": "荣德基教育:马强"
    },
...
{
      "ebookId": 52365,
      "gmtCreate": 1582279297000,
      "gmtModified": 1582279297000,
      "id": 2578614,
      "imgurl": "https://yuntisyscdn.bookln.cn/server/ebook/pdf/bookln/52365/6956499_4D7D4F4BBBE81A0EE2A808D42CD676AF.png",
      "isDelete": 0,
      "pageNo": 299,
      "status": 1,
      "userId": 6956499,
      "userName": "荣德基教育:马强"
    }],
    "ebookConf": {
      "ebookId": 52365,
      "gmtCreate": xxx68000,
      "gmtModified": xxx68000,
      "id": 32364,
      "pageVoice": 1
    },
    "onlineStatus": 1,
    "description": "2020春 初中点拨 八年级英语（R版）",
    "thumbnails": "http://cdn11.bookln.cn/6956499_BFC33BDB77F6643454E86B8318EA281E.jpeg",
    "bookName": "2020春 初中点拨 八年级英语（R版）",
    "userId": 6956499
  },
  "success": true
}

所以，思路是：先获取该json，解析后，再根据每页的imgurl挨个下载图片，同时用pageNo的值给保存的图片文件命名（以保持页码顺序）。

【未解决】模拟mp.codeup.cn中调用queryAllPageByEbookId.do返回json数据

暂时没把js代码转python。

所以只能是：

直接把Chrome调试得到的json保存到本地文件，再用代码去解析和下载图片：

# Function:
#   电子样书 点拨 八年级英语下
#   http://mp.codeup.cn/book/sample2.htm?id=52365&shelfId=4824&share_=6765370&sh=sh&vt_=1583111113754&_logined=1
#
#   电子样书 点拔训练 八年级英语下
#   http://mp.codeup.cn/book/sample2.htm?id=52489&shelfId=4822&share_=6765370&sh=sh&vt_=1583111131475
#   的图片
# Author: Crifan Li
# Update: 20200303


import os
import json
# import copy
import codecs
import requests


# IDs of the e-books whose page images are downloaded
gBookIdList = ["52365", "52489"]


# Browser identity (Chrome 80 on macOS) presented to the image server
UserAgent_Mac_Chrome = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
)


# HTTP headers shared by every request of this script
gHeaders = {"User-Agent": UserAgent_Mac_Chrome, "origin": "http://mp.codeup.cn"}


# Root folder the downloaded images are written to
gSaveFolder = os.path.join("output", "mp.codeup.cn")
# Folder holding the manually saved queryAllPageByEbookId.do JSON responses
gInputFolder = os.path.join(
    "input", "mp.codeup.cn", "queryAllPageByEbookId_resp"
)

def createFolder(folderFullPath):
    """
        Make sure the folder exists, creating missing parent folders as needed.
        An already existing folder is not treated as an error.
        Note: needs Python 3.2+ (exist_ok argument of os.makedirs)
    """
    os.makedirs(name=folderFullPath, exist_ok=True)


# Create the root output folder as soon as the script starts
createFolder(gSaveFolder)


# for eachBookId in gBookIdList:
#     getAllPageUrl = "https://biz.bookln.cn/ebookpageservices/queryAllPageByEbookId.do"
#     curHeaders = copy.deepcopy(gHeaders)
#     curHeaders["Content-Type"] = "application/x-www-form-urlencoded"
#     curHeaders["Accept"] = "application/json, text/javascript, */*; q=0.01"
#     curHeaders["referer"] = "http://mp.codeup.cn/book/sample2.htm?id=%s" % eachBookId
#     curHeaders["sec-fetch-dest"] = "empty"
#     curHeaders["sec-fetch-mode"] = "cors"
#     curHeaders["sec-fetch-site"] = "cross-site"
#     postDict = {
#       "ebookId": eachBookId
#     }
#     resp = requests.post(getAllPageUrl, headers=curHeaders, data=postDict)
#     print("resp=%s" % resp)


def loadJsonFromFile(fullFilename, fileEncoding="utf-8"):
    """Load and parse the JSON content of a text file.

    Args:
        fullFilename: path of the JSON file to read
        fileEncoding: text encoding used to decode the file, utf-8 by default

    Returns:
        The parsed JSON value (dict, list, str, number, bool or None)

    Raises:
        OSError: the file can not be opened
        ValueError: the content can not be decoded with fileEncoding
            (UnicodeDecodeError) or is not valid JSON (json.JSONDecodeError)
    """
    # The built-in open() decodes the text itself; codecs.open() is a legacy
    # API that is deprecated since Python 3.14
    with open(fullFilename, "r", encoding=fileEncoding) as jsonFp:
        return json.load(jsonFp)


def _downloadImage(imgUrl, saveFullPath, timeoutSeconds=30):
    """Download one image to saveFullPath; return True on success, else False.

    Network errors and non-OK HTTP responses are reported and turned into a
    False result, so that one bad page does not abort the whole run.
    """
    try:
        # Without a timeout a stalled connection would block the script forever
        resp = requests.get(imgUrl, headers=gHeaders, timeout=timeoutSeconds)
    except requests.RequestException as reqErr:
        print("  Failed to download %s: %s" % (imgUrl, reqErr))
        return False

    if not resp.ok:
        print("  Failed to download %s: HTTP %s" % (imgUrl, resp.status_code))
        return False

    # Write under a temporary name and rename afterwards: an interrupted write
    # must not leave a truncated file that the next run reports as "existed"
    partFullPath = saveFullPath + ".part"
    with open(partFullPath, 'wb') as saveFp:
        saveFp.write(resp.content)
    os.replace(partFullPath, saveFullPath)
    print("  Saved to %s" % saveFullPath)
    return True


# For every book: read the saved queryAllPageByEbookId.do response from
# gInputFolder and download each page image that is not on disk yet
for eachBookId in gBookIdList:
    print("%s bookId=%s %s" % ('-'*30, eachBookId, '-'*30))
    curOutputFolder = os.path.join(gSaveFolder, eachBookId)
    createFolder(curOutputFolder)

    curJsonFullPath = os.path.join(gInputFolder, "%s.json" % eachBookId)
    curBookJsonDict = loadJsonFromFile(curJsonFullPath)

    # The page list is nested as {"data": {"data": [<page dict>, ...]}}
    pageDictList = curBookJsonDict["data"]["data"]

    failedPageNoList = []
    for eachPageDict in pageDictList:
        # Fields read from each page dict:
        #   "ebookId": 52365
        #   "imgurl":  "https://yuntisyscdn.bookln.cn/server/ebook/pdf/bookln/52365/6956499_4D7D4F4BBBE81A0EE2A808D42CD676AF.png"
        #   "pageNo":  299
        ebookId = eachPageDict["ebookId"]
        imgurl = eachPageDict["imgurl"]
        print("imgurl=%s" % imgurl)
        imgSuffix = imgurl.split(".")[-1]
        pageNo = eachPageDict["pageNo"]
        # Zero-padded page number keeps the files sorted in page order
        saveFilename = "%s_%03d.%s" % (ebookId, pageNo, imgSuffix)
        saveFullPath = os.path.join(curOutputFolder, saveFilename)

        if os.path.exists(saveFullPath):
            # Already downloaded by a previous run
            print("existed: %s" % saveFullPath)
            continue

        if not _downloadImage(imgurl, saveFullPath):
            failedPageNoList.append(pageNo)

    if failedPageNoList:
        print("bookId=%s failed pageNo list: %s" % (eachBookId, failedPageNoList))

即可下载到：

...
imgurl=https://yuntisyscdn.bookln.cn/server/ebook/pdf/bookln/52489/6956499_AFD4EC4CD78BAA02A1179DEC7F9BEC27.png
existed: output/mp.codeup.cn/52489/52489_181.png
imgurl=https://yuntisyscdn.bookln.cn/server/ebook/pdf/bookln/52489/6956499_40DE1350C9DA75781C7F9E694A55FE15.png
existed: output/mp.codeup.cn/52489/52489_182.png

一堆图片：

转载请注明：在路上 » 【已解决】爬取mp.codeup.cn中的英语教材电子书资源

Post Views: 1,453

与本文相关的文章