Background:
By this point I had already roughly analyzed the APIs behind the picture-book data.
Now I needed to write code to simulate downloading that data.
First, make sure the first API, the one that fetches the picture-book list, can actually return data.
Process:
Kept writing code.
Result:
With that working, I could go on and write the code that actually crawls the data.
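To make the request format concrete before diving into the full script: the two list APIs (parentChildReadingBookQuery2 and selfReadingBookQuery2) take a POST body whose "J" field is itself a JSON string carrying the userId, filters and offset/limit paging, wrapped together with a fixed "C": 0. A minimal sketch of building one page's body; the values simply mirror the constants used in the full script further below:
import json

userId = "1134723"   # my account id, same value as gUserId in the script below
offset = 0           # paging: start of the first page
limit = 10           # paging: page size, same as DefaultPageSize below

# inner query for parentChildReadingBookQuery2 ("全部类别" = all categories, "有音频" = has audio)
innerQuery = {
    "userId": userId,
    "fieldName": "",
    "fieldValue": "全部类别",
    "theStageOfTheChild": "",
    "parentalEnglishLevel": "",
    "supportingResources": "有音频",
    "offset": offset,
    "limit": limit,
}

# the actual POST body: the inner query serialized into the "J" string, plus a fixed "C": 0
postBody = json.dumps({"J": json.dumps(innerQuery, ensure_ascii=False), "C": 0})
print(postBody)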
Along the way I also ran into:
[Solved] PySpider simulating the Xiaohuasheng app's parentChildReadingBookQuery2 request returns empty data
as well as:
[Solved] The logic for generating timestamp and signature when the Xiaohuasheng app calls the parentChildReadingBookQuery2 API
Then I went on to simulate the remaining selfReadingBookQuery2, continuing to refer to:
[Solved] Using Charles + Postman + a Python decryption script to analyze the picture-book APIs and response data in the Xiaohuasheng app
to implement the remaining API requests.
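The signature post and the decryption post above come down to two pieces of logic that the script below implements: the signature request header is the lowercase MD5 hex digest of userId + timestamp + (the J request body or the URL endpoint) + userToken + secretKey, and the response's "J" field is gzip-compressed JSON that has then been base64-encoded. A condensed sketch of just those two steps, extracted from the full script below:
import base64
import gzip
import json
import time
from hashlib import md5

def make_signature(userId, userToken, secretKey, jValueOrEndpoint):
    """signature = lowercase MD5 hex of: userId + timestamp + (J body or URL endpoint) + userToken + secretKey"""
    timestamp = str(int(time.time()))  # 10-digit seconds timestamp, sent in the "timestamp" header
    raw = userId + timestamp + jValueOrEndpoint + userToken + secretKey
    return timestamp, md5(raw.encode()).hexdigest()

def decode_response_j(respJson):
    """decode the response: "M" == "1001" means OK, "J" is base64-encoded gzip-compressed JSON"""
    if respJson["M"] != "1001":
        return None
    decompressed = gzip.decompress(base64.b64decode(respJson["J"]))
    return json.loads(decompressed.decode("UTF-8"))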
With that, the code was basically finished, and also optimized:
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-03-27 15:35:20
# Project: XiaohuashengApp
from pyspider.libs.base_handler import *
import os
import json
import codecs
import base64
import gzip
import copy
import time
import re
# import datetime
from datetime import datetime, timedelta
from hashlib import md5
######################################################################
# Const
######################################################################
gServerPort = "http://www.xiaohuasheng.cn:83"
gResourcesRoot = "https://img.xiaohuasheng.cn"
SelfReadingUrl = "http://www.xiaohuasheng.cn:83/Reading.svc/selfReadingBookQuery2"
ParentChildReadingUrl = "http://www.xiaohuasheng.cn:83/Reading.svc/parentChildReadingBookQuery2"
# ViewEnglishSeries2UrlPrefix = "http://www.xiaohuasheng.cn:83/Reading.svc/viewEnglishSeries2"
RESPONSE_OK = "1001"
######################################################################
# Config & Settings
######################################################################
OutputFolder = "/Users/crifan/dev/dev_root/company/xxx/projects/crawler_projects/crawler_xiaohuasheng_app/output"
DefaultPageSize = 10
gUserAgentNoxAndroid = "Mozilla/5.0 (Linux; U; Android 4.4.2; zh-cn; A0001 Build/KOT49H) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"
gUserId = "1134723"
gAuthorization = """NSTp9~)NwSfrXp@\\"""
gUserToken = "40d2267f-359e-4526-951a-66519e5868c3"
gSecretKey = "AyGt7ohMR!xx#N"
gHeaders = {
"Host": "www.xiaohuasheng.cn:83",
"User-Agent": gUserAgentNoxAndroid,
"Content-Type": "application/json",
"userId": gUserId,
"Authorization": gAuthorization,
# "timestamp": gTimestamp,
# "signature": gSignature,
"cookie": "ASP.NET_SessionId=dxf3obxgn5t4w350xp3icgy0",
# "Cookie2": "$Version=1",
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate",
"cache-control": "no-cache",
"Connection": "keep-alive",
# "content-length": "202",
}
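# Query parameter values taken from the captured app requests; level -1 means "all levels"
# (see the level list in the getSeriesAudioPackages docstring further below)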
gParamLevelAll = -1
gFixParam1 = 1
gLongitude = "120.136174"
gLatitude = "28.997280"
gFixParam2 = 10
######################################################################
# Common Util Functions
######################################################################
def getCurTimestamp(withMilliseconds=False):
"""
get current time's timestamp
(default)not milliseconds -> 10 digits: 1351670162
with milliseconds -> 13 digits: 1531464292921
"""
curDatetime = datetime.now()
return datetimeToTimestamp(curDatetime, withMilliseconds)
def datetimeToTimestamp(datetimeVal, withMilliseconds=False) :
"""
convert datetime value to timestamp
eg:
"2006-06-01 00:00:00.123" -> 1149091200
if with milliseconds -> 1149091200123
:param datetimeVal:
:return:
"""
timetupleValue = datetimeVal.timetuple()
timestampFloat = time.mktime(timetupleValue) # 1531468736.0 -> 10 digits
timestamp10DigitInt = int(timestampFloat) # 1531468736
timestampInt = timestamp10DigitInt
if withMilliseconds:
microsecondInt = datetimeVal.microsecond # 817762
microsecondFloat = float(microsecondInt)/float(1000000) # 0.817762
timestampFloat = timestampFloat + microsecondFloat # 1531468736.817762
timestampFloat = timestampFloat * 1000 # 1531468736817.7621 -> 13 digits
timestamp13DigitInt = int(timestampFloat) # 1531468736817
timestampInt = timestamp13DigitInt
return timestampInt
def extractSuffix(fileNameOrUrl):
"""
extract file suffix from name or url
eg:
https://cdn2.xxx.cn/2018-09-10/15365514898246.mp4 -> mp4
15365514894833.srt -> srt
"""
return fileNameOrUrl.split('.')[-1]
def createFolder(folderFullPath):
"""
    create folder; no error if it already exists
Note: for Python 3.2+
"""
os.makedirs(folderFullPath, exist_ok=True)
print("Created folder: %s" % folderFullPath)
def saveDataToFile(fullFilename, binaryData):
"""save binary data info file"""
with open(fullFilename, 'wb') as fp:
fp.write(binaryData)
print("Complete save file %s" % fullFilename)
def saveJsonToFile(fullFilename, jsonValue):
"""save json dict into file"""
with codecs.open(fullFilename, 'w', encoding="utf-8") as jsonFp:
json.dump(jsonValue, jsonFp, indent=2, ensure_ascii=False)
print("Complete save json %s" % fullFilename)
def loadJsonFromFile(fullFilename):
"""load and parse json dict from file"""
with codecs.open(fullFilename, 'r', encoding="utf-8") as jsonFp:
jsonDict = json.load(jsonFp)
print("Complete load json from %s" % fullFilename)
return jsonDict
######################################################################
# Project Specific Functions
######################################################################
def getSeriesFolder(seriesId):
return os.path.abspath(os.path.join(OutputFolder, "series", str(seriesId)))
def getSeriesAudioPackagesFolder(seriesId):
return os.path.abspath(os.path.join(getSeriesFolder(seriesId), "AudioPackages"))
def getSeriesBooksFolder(seriesId):
return os.path.abspath(os.path.join(getSeriesFolder(seriesId), "Books"))
def getSingleAudioPackageFolder(seriesId, audioPackageId):
return os.path.abspath(os.path.join(getSeriesAudioPackagesFolder(seriesId), str(audioPackageId)))
def getSingleAudioFolder(seriesId, audioPackageId, audioId):
return os.path.abspath(os.path.join(getSingleAudioPackageFolder(seriesId, audioPackageId), str(audioId)))
def getSingleBookFolder(seriesId, bookId):
return os.path.abspath(os.path.join(getSeriesBooksFolder(seriesId), str(bookId)))
######################################################################
# Main
######################################################################
class Handler(BaseHandler):
crawl_config = {
}
#----------------------------------------
# Util Functions
#----------------------------------------
def downloadFileCallback(self, response):
fileInfo = response.save
print("fileInfo=%s" % fileInfo)
binData = response.content
fileFullPath = os.path.join(fileInfo["saveFolder"], fileInfo["filename"])
print("fileFullPath=%s" % fileFullPath)
saveDataToFile(fileFullPath, binData)
def downloadFile(self, fileInfo):
urlToDownload = fileInfo["fileUrl"]
print("urlToDownload=%s" % urlToDownload)
self.crawl(urlToDownload,
callback=self.downloadFileCallback,
save=fileInfo)
def generateSignature(self, timestampInt, jValueOrUrlEndpoint):
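        # The "signature" header is the lowercase MD5 hex digest of the concatenation:
        #   userId + timestamp + (J request body or URL endpoint) + userToken + secretKey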
# print("generateSignature: timestampInt=%d, jValueOrUrlEndpoint=%s" % (timestampInt, jValueOrUrlEndpoint))
# userId = "1134723"
userId = gUserId
timestamp = "%s" % timestampInt
# localObject = "/Reading.svc/parentChildReadingBookQuery2"
# localObject = jValueOrUrlEndpoint
# userToken = "40d2267f-359e-4526-951a-66519e5868c3"
userToken = gUserToken
        # fixedSault = "AyGt7ohMR!xx#N"
        # secretKey = "AyGt7ohMR!xx#N"
secretKey = gSecretKey
# strToCalc = userId + timestamp + localObject + jValueOrUrlEndpoint + fixedSault
# strToCalc = timestamp + localObject + fixedSault
strToCalc = userId + timestamp + jValueOrUrlEndpoint + userToken + secretKey
# print("strToCalc=%s" % strToCalc)
encodedStr = strToCalc.encode()
# encodedStr = strToCalc.encode("UTF-8")
# print("encodedStr=%s" % encodedStr)
md5Result = md5(encodedStr)
# print("md5Result=%s" % md5Result) # md5Result=<md5 HASH object @ 0x1044f1df0>
# md5Result = md5()
# md5Result.update(strToCalc)
# md5Digest = md5Result.digest()
# print("md5Digest=%s" % md5Digest) #
# print("len(md5Digest)=%s" % len(md5Digest))
md5Hexdigest = md5Result.hexdigest()
# print("md5Hexdigest=%s" % md5Hexdigest)
# print("len(md5Hexdigest)=%s" % len(md5Hexdigest))
# md5Hexdigest=c687d5dfa015246e6bdc6b3c27c2afea
# print("md5=%s from %s" % (md5Hexdigest, strToCalc))
return md5Hexdigest
# return md5Digest
def extractResponseData(self, respJson):
"""
{
"C": 2,
"J": "H4sIAA.......AA=",
"M": "1001",
"ST": null
}
"""
# respJson = json.loads(respJson)
respM = respJson["M"]
if respM != RESPONSE_OK:
return None
encodedStr = respJson["J"]
decodedStr = base64.b64decode(encodedStr)
# print("decodedStr=%s" % decodedStr)
decompressedStr = gzip.decompress(decodedStr)
# print("decompressedStr=%s" % decompressedStr)
decompressedStrUnicode = decompressedStr.decode("UTF-8")
# print("decompressedStrUnicode=%s" % decompressedStrUnicode)
decompressedJson = json.loads(decompressedStrUnicode)
respDataDict = decompressedJson
return respDataDict
def generateCurrentHeaders(self, jValueOrUrlEndpoint):
curHeaders = copy.deepcopy(gHeaders)
curTimestampInt = getCurTimestamp()
curTimestampStr = str(curTimestampInt)
curHeaders["timestamp"] = curTimestampStr
curSignature = self.generateSignature(curTimestampInt, jValueOrUrlEndpoint)
curHeaders["signature"] = curSignature
return curHeaders
def dictValueStrToJson(self, originDict):
"""
        auto-detect whether a field name ends with Json or ArrayJson, then convert its JSON string value into a dict/list
"""
processedDict = originDict
if isinstance(processedDict, dict):
firstLevelKeys = processedDict.keys()
for eachFieldName in firstLevelKeys:
isArrayJson = re.match(r"\w+ArrayJson$", eachFieldName)
isJson = re.match(r"\w+Json$", eachFieldName)
# print("isArrayJson=%s, isJson=%s" % (isArrayJson, isJson))
if isArrayJson or isJson:
fieldValueJsonStr = processedDict[eachFieldName]
# print("%s -> fieldValueJsonStr=%s" % (eachFieldName, fieldValueJsonStr))
if fieldValueJsonStr:
fieldValueDict = json.loads(fieldValueJsonStr)
else:
fieldValueDict = None
fieldValueDict = self.dictValueStrToJson(fieldValueDict)
processedDict[eachFieldName] = fieldValueDict
elif isinstance(originDict, list):
newList = []
for eachItem in originDict:
processedItem = self.dictValueStrToJson(eachItem)
newList.append(processedItem)
processedDict = newList
return processedDict
#----------------------------------------
# Crawl Logic
#----------------------------------------
def on_start(self):
jValueTemplateSelfReading = "{\"userId\":\"%s\",\"fieldName\":\"\",\"fieldValue\":\"全部类别\",\"grades\":\"\",\"levels\":\"\",\"supportingResources\":\"有音频\",\"offset\":%d,\"limit\":%d}"
jValueTemplateParentChildReading = "{\"userId\":\"%s\",\"fieldName\":\"\",\"fieldValue\":\"全部类别\",\"theStageOfTheChild\":\"\",\"parentalEnglishLevel\":\"\",\"supportingResources\":\"有音频\",\"offset\":%d,\"limit\":%d}"
paramDictSelfReading = {
"curUrl": SelfReadingUrl,
"offset": 0,
"limit": DefaultPageSize,
"jValueTemplate": jValueTemplateSelfReading
}
self.getBookQuery2(paramDictSelfReading)
paramDictParentChildReading = {
"curUrl": ParentChildReadingUrl,
"offset": 0,
"limit": DefaultPageSize,
"jValueTemplate": jValueTemplateParentChildReading
}
self.getBookQuery2(paramDictParentChildReading)
def getBookQuery2(self, curParamDict):
print("getBookQuery2: curParamDict=%s" % curParamDict)
curUrl = curParamDict["curUrl"]
jValueTemplate = curParamDict["jValueTemplate"]
offset = curParamDict["offset"]
limit = curParamDict["limit"]
jValueStr = jValueTemplate % (gUserId, offset, limit)
jcJsonDict = {
"J": jValueStr,
"C": 0
}
jcJsonDictStr = json.dumps(jcJsonDict)
curParamDict["jValueStr"] = jValueStr
curParamDict["jcJsonDict"] = jcJsonDict
curParamDict["jcJsonDictStr"] = jcJsonDictStr
curHeaders = self.generateCurrentHeaders(jValueStr)
# add hash value for url to force re-crawl when POST url not changed
timestampStr = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
curUrlWithHash = curUrl + "#" + timestampStr
fakeItagForceRecrawl = "%s_%s_%s" % (timestampStr, offset, limit)
self.crawl(curUrlWithHash,
itag=fakeItagForceRecrawl, # To force re-crawl for next page
method="POST",
# data=jcJsonDict,
data= jcJsonDictStr,
# callback=curCallback,
callback=self.getBookQuery2Callback,
headers=curHeaders,
save=curParamDict
)
def getBookQuery2Callback(self, response):
respUrl = response.url
print("respUrl=%s" % respUrl)
prevParaDict = response.save
print("prevParaDict=%s" % prevParaDict)
respJson = response.json
print("respJson=%s" % respJson)
respData = self.extractResponseData(respJson)
print("respData=%s" % respData)
if respData:
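            # a non-empty page came back: queue the next page first, then process this page's series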
newOffset = prevParaDict["offset"] + prevParaDict["limit"]
prevParaDict["offset"] = newOffset
self.getBookQuery2(prevParaDict)
bookSeriesList = respData
for eachBookSerie in bookSeriesList:
print("eachBookSerie=%s" % eachBookSerie)
self.getStorybookDetail(eachBookSerie)
else:
print("!!! %s return no more data: %s" % (response.url, respJson))
def getStorybookDetail(self, bookSerieDict):
print("getStorybookDetail: bookSerieDict=%s" % bookSerieDict)
        seriesPrimaryKey = bookSerieDict["pk"]
        urlEndpoint = "/Reading.svc/viewEnglishSeries2/%s/%s" % (gUserId, seriesPrimaryKey)
fullUrl = "%s%s" % (gServerPort, urlEndpoint)
# http://www.xiaohuasheng.cn:83/Reading.svc/viewEnglishSeries2/1134723/31
print("urlEndpoint=%s, fullUrl=%s" % (urlEndpoint, fullUrl))
curHeaders = self.generateCurrentHeaders(urlEndpoint)
self.crawl(fullUrl,
method="GET",
callback=self.getSerieDetailCallback,
headers=curHeaders,
# save=bookSerieDict
)
def getSerieDetailCallback(self, response):
respUrl = response.url
print("respUrl=%s" % respUrl)
# bookSerieDict = response.save
# print("bookSerieDict=%s" % bookSerieDict)
respJson = response.json
print("respJson=%s" % respJson)
respData = self.extractResponseData(respJson)
print("respData=%s" % respData)
respDict = respData[0]
# respDict["url"] = response.url
# return respDict
bookSeriesDict = respDict
seriesId = bookSeriesDict["pk"]
self.saveSeriesInfo(bookSeriesDict)
# get audio
audioPackagesParamDict = {
"seriesId": seriesId,
"level": gParamLevelAll,
"fixParam1": gFixParam1,
"offset": 0,
"limit": DefaultPageSize
}
self.getSeriesAudioPackages(audioPackagesParamDict)
# get book info
bookParamDict = {
"seriesId": seriesId,
"level": gParamLevelAll,
"offset": 0,
"limit": DefaultPageSize
}
self.getSeriesBook(bookParamDict)
def getSeriesBook(self, paramDict):
urlEndpoint = "/Reading.svc/queryEnglishSeriesBook/%s/%s/%s/%s/%s" % \
(gUserId, paramDict["seriesId"], paramDict["level"], paramDict["offset"], paramDict["limit"])
print("urlEndpoint=%s" % urlEndpoint)
fullUrl = "%s%s" % (gServerPort, urlEndpoint)
# http://www.xiaohuasheng.cn:83/Reading.svc/queryEnglishSeriesBook/1134723/31/-1/0/10
curHeaders = self.generateCurrentHeaders(urlEndpoint)
self.crawl(fullUrl,
method="GET",
callback=self.getSeriesBookCallback,
headers=curHeaders,
save=paramDict,
)
def getSeriesBookCallback(self, response):
respUrl = response.url
print("respUrl=%s" % respUrl)
respJson = response.json
print("respJson=%s" % respJson)
respData = self.extractResponseData(respJson)
print("respData=%s" % respData)
if respData:
prevParamDict = response.save
curParamDict = prevParamDict
curParamDict["offset"] += curParamDict["limit"]
self.getSeriesBook(curParamDict)
seriesId = curParamDict["seriesId"]
seriesBookList = respData
print("seriesBookList=%s" % seriesBookList)
for eachBookDict in seriesBookList:
print("eachBookDict=%s" % eachBookDict)
curBookId = eachBookDict["pk"]
self.getSingleBookInfo(seriesId, curBookId)
else:
print("!!! %s return no more data: %s" % (response.url, respJson))
def getSingleBookInfo(self, seriesId, curBookId):
urlEndpoint = "/Reading.svc/getServerBookInfo17/%s/%s/%s/%s/%s" % \
(gUserId, gLongitude, gLatitude, curBookId, gFixParam2)
print("urlEndpoint=%s" % urlEndpoint)
fullUrl = "%s%s" % (gServerPort, urlEndpoint)
# http://www.xiaohuasheng.cn:83/Reading.svc/getServerBookInfo17/1134723/120.136174/28.997280/109512/10
curHeaders = self.generateCurrentHeaders(urlEndpoint)
self.crawl(fullUrl,
method="GET",
callback=self.getSingleBookInfoCallback,
headers=curHeaders,
save=seriesId,
)
def getSingleBookInfoCallback(self, response):
seriesId = response.save
print("seriesId=%s" % seriesId)
respUrl = response.url
print("respUrl=%s" % respUrl)
respJson = response.json
print("respJson=%s" % respJson)
respData = self.extractResponseData(respJson)
print("respData=%s" % respData)
bookInfoDict = respData[0]
print("bookInfoDict=%s" % bookInfoDict)
self.saveSingleBookInfo(seriesId, bookInfoDict)
def saveSingleBookInfo(self, seriesId, bookInfoDict):
# curSeriesBooksFolder = getSeriesBooksFolder(seriesId)
# print("curSeriesBooksFolder=%s" % curSeriesBooksFolder)
# createFolder(curSeriesBooksFolder)
bookId = bookInfoDict["pk"]
singleBooksFolder = getSingleBookFolder(seriesId, bookId)
print("singleBooksFolder=%s" % singleBooksFolder)
createFolder(singleBooksFolder)
singleBookFilename = "series_%s_Books_%s_info.json" % (seriesId, bookId)
singleBookFullPath = os.path.abspath(os.path.join(singleBooksFolder, singleBookFilename))
bookInfoDict = self.dictValueStrToJson(bookInfoDict)
saveJsonToFile(singleBookFullPath, bookInfoDict)
# download and save: frontCover
# "frontCover": "149/Book/20160930171033.png",
coverImageUrlTail = bookInfoDict["frontCover"]
if coverImageUrlTail:
coverImageFilename = ("Books_%s_" % bookId) + coverImageUrlTail.replace("/", "_")
imageFileInfo = {
"fileUrl": gResourcesRoot + "/" + coverImageUrlTail,
"filename": coverImageFilename,
"saveFolder": singleBooksFolder,
}
self.downloadFile(imageFileInfo)
def saveSeriesInfo(self, bookSeriesDict):
seriesId = bookSeriesDict["pk"]
curSeriesFolder = getSeriesFolder(seriesId)
print("curSeriesFolder=%s" % curSeriesFolder)
createFolder(curSeriesFolder)
filenamePrefix = "series_%s" % seriesId
seriesFilename = "%s_info.json" % filenamePrefix
seriesFullPath = os.path.abspath(os.path.join(curSeriesFolder, seriesFilename))
bookSeriesDict = self.dictValueStrToJson(bookSeriesDict)
saveJsonToFile(seriesFullPath, bookSeriesDict)
# download series cover image
"""
/series/623/series_623_info.json
{
"pk": 623,
"englishTitle": "Peppa Pig",
"chineseTitle": "小猪佩奇绘本集",
"picture": "System/EnglishSeriesPicture/20190114112209525.jpg",
...
/series/158/series_158_info.json
{
"pk": 158,
"englishTitle": "An Elephant and Piggie Book",
"chineseTitle": "小猪小象绘本系列",
...
"picture": "",
"lessonPlanFirstPictureUrl": "https://img.xiaohuasheng.cn/20180911145347266_80f5f443a43bb430663a71b381cde40e.jpg",
"""
fileUrl = None
coverImageUrlTail = bookSeriesDict["picture"]
lessonPlanFirstPictureUrl = bookSeriesDict["lessonPlanFirstPictureUrl"]
if coverImageUrlTail:
coverImageFilename = filenamePrefix + coverImageUrlTail.replace("/", "_")
fileUrl = gResourcesRoot + "/" + coverImageUrlTail
elif lessonPlanFirstPictureUrl:
coverImageFilename = filenamePrefix + "_" + lessonPlanFirstPictureUrl.split("/")[-1]
fileUrl = lessonPlanFirstPictureUrl
if fileUrl:
imageFileInfo = {
"fileUrl": fileUrl,
"filename": coverImageFilename,
"saveFolder": curSeriesFolder,
}
self.downloadFile(imageFileInfo)
def getSeriesAudioPackages(self, paramDict):
urlEndpoint = "/Reading.svc/queryEnglishSeriesAudio/%s/%s/%s/%s/%s/%s" % \
(gUserId, paramDict["seriesId"], paramDict["level"], paramDict["fixParam1"], paramDict["offset"], paramDict["limit"])
print("urlEndpoint=%s" % urlEndpoint)
fullUrl = "%s%s" % (gServerPort, urlEndpoint)
# http://www.xiaohuasheng.cn:83/Reading.svc/queryEnglishSeriesAudio/1134723/31/-1/1/0/10
"""
http://www.xiaohuasheng.cn:83/Reading.svc/getLevelForQueryEnglishSeriesAudio/1134723/31
return english series level:
[
{
"pk": -1,
"name": "全部"
},
{
"pk": 79,
"name": "Level 1"
},
{
"pk": 80,
"name": "Level 2"
},
{
"pk": 81,
"name": "Level 3"
}
]
"""
curHeaders = self.generateCurrentHeaders(urlEndpoint)
self.crawl(fullUrl,
method="GET",
callback=self.getSeriesAudioPackagesCallback,
headers=curHeaders,
save=paramDict,
)
def saveSeriesAudioPackagesInfo(self, seriesAudioPackagesInfo):
print("saveSeriesAudioPackagesInfo: seriesAudioPackagesInfo=%s" % seriesAudioPackagesInfo)
seriesId = seriesAudioPackagesInfo["seriesId"]
curAudioPackagesFolder = getSeriesAudioPackagesFolder(seriesId)
print("curAudioPackagesFolder=%s" % curAudioPackagesFolder)
if not os.path.exists(curAudioPackagesFolder):
createFolder(curAudioPackagesFolder)
audioPackagesFilename = "series_%s_AudioPackages_info.json" % seriesId
print("audioPackagesFilename=%s" % audioPackagesFilename)
audioPackagesFullPath = os.path.abspath(os.path.join(curAudioPackagesFolder, audioPackagesFilename))
if os.path.exists(audioPackagesFullPath):
print("alreay existed %s" % audioPackagesFullPath)
# append
prevAudioPackagesInfo = loadJsonFromFile(audioPackagesFullPath)
prevSeriesId = prevAudioPackagesInfo["seriesId"]
if prevSeriesId != seriesId:
print("!!! Unexpected not same id for saving series audio info, old=%s, new=%s" % (prevSeriesId, seriesId))
else:
newAudioPackagesInfo = prevAudioPackagesInfo
newAudioPackagesInfo["AudioPackages"].extend(seriesAudioPackagesInfo["AudioPackages"])
saveJsonToFile(audioPackagesFullPath, newAudioPackagesInfo)
else:
print("not existed %s" % audioPackagesFullPath)
# write
saveJsonToFile(audioPackagesFullPath, seriesAudioPackagesInfo)
def getSeriesAudioPackagesCallback(self, response):
respUrl = response.url
print("respUrl=%s" % respUrl)
respJson = response.json
print("respJson=%s" % respJson)
respData = self.extractResponseData(respJson)
print("respData=%s" % respData)
if respData:
prevParamDict = response.save
curParamDict = prevParamDict
curParamDict["offset"] += curParamDict["limit"]
self.getSeriesAudioPackages(curParamDict)
seriesAudioPackagesList = respData
seriesId = curParamDict["seriesId"]
seriesAudioPackagesInfo = {
"seriesId": seriesId,
"AudioPackages": seriesAudioPackagesList
}
self.saveSeriesAudioPackagesInfo(seriesAudioPackagesInfo)
print("seriesAudioPackagesList=%s" % seriesAudioPackagesList)
for eachAudioPackageDict in seriesAudioPackagesList:
print("eachAudioPackageDict=%s" % eachAudioPackageDict)
audioPackageId = eachAudioPackageDict["pk"]
self.getAudioPackage(seriesId, audioPackageId)
else:
print("!!! %s return no more data: %s" % (response.url, respJson))
def getAudioPackage(self, seriesId, audioPackageId):
urlEndpoint = "/Reading.svc/viewAudioPackage/%s/%s/%s" % (gUserId, audioPackageId, gFixParam1)
fullUrl = "%s%s" % (gServerPort, urlEndpoint)
# http://www.xiaohuasheng.cn:83/Reading.svc/viewAudioPackage/1134723/1808/1
print("urlEndpoint=%s, fullUrl=%s" % (urlEndpoint, fullUrl))
curHeaders = self.generateCurrentHeaders(urlEndpoint)
self.crawl(fullUrl,
method="GET",
callback=self.getAudioPackageCallback,
headers=curHeaders,
save=seriesId
)
def getAudioPackageCallback(self, response):
seriesId = response.save
print("seriesId=%s" % seriesId)
respUrl = response.url
print("respUrl=%s" % respUrl)
respJson = response.json
print("respJson=%s" % respJson)
respData = self.extractResponseData(respJson)
print("respData=%s" % respData)
audioPackageDict = respData[0]
print("audioPackageDict=%s" % audioPackageDict)
self.saveSingleAudioPackageInfo(seriesId, audioPackageDict)
audioArrayJsonStr = audioPackageDict["audioArrayJson"]
print("audioArrayJsonStr=%s" % audioArrayJsonStr)
audioPackageId = audioPackageDict["pk"]
# audioArrayDictList = json.loads(audioArrayJsonStr)
audioArrayDictList = audioArrayJsonStr
print("audioArrayDictList=%s" % audioArrayDictList)
for singleAudioDict in audioArrayDictList:
print("singleAudioDict=%s" % singleAudioDict)
singleAudioDict["seriesId"] = seriesId
singleAudioDict["audioPackageId"] = audioPackageId
self.saveSingleAudio(singleAudioDict)
def saveSingleAudioPackageInfo(self, seriesId, audioPackageInfo):
audioPackageId = audioPackageInfo["pk"]
curSingleAudioPackageFolder = getSingleAudioPackageFolder(seriesId, audioPackageId)
print("curSingleAudioPackageFolder=%s" % curSingleAudioPackageFolder)
createFolder(curSingleAudioPackageFolder)
filenamePrefix = "series_%s_AudioPackages_%s" % (seriesId, audioPackageId)
singleAudioPackageFilename = "%s_info.json" % (filenamePrefix)
singleAudioPackageFullPath = os.path.abspath(os.path.join(curSingleAudioPackageFolder, singleAudioPackageFilename))
audioPackageInfo = self.dictValueStrToJson(audioPackageInfo)
saveJsonToFile(singleAudioPackageFullPath, audioPackageInfo)
# download bookSeriesPicture
# case 1:
# "bookSeriesPicture": "EnglishLevelFrontCoverOrInnerPage/79/封面.jpg",
# coverImageUrlTail = audioPackageInfo["bookSeriesPicture"]
# case 2:
# "picture": "attached/image/20190114/20190114103636_2075.jpg",
# "bookSeriesPicture": "",
coverImageUrlTail = audioPackageInfo["picture"]
print("coverImageUrlTail=%s" % coverImageUrlTail)
if coverImageUrlTail:
imageSuffix = coverImageUrlTail.split(".")[-1]
imageFileInfo = {
"fileUrl": gResourcesRoot + "/" + coverImageUrlTail,
"filename": "%s_coverImage.%s" % (filenamePrefix, imageSuffix),
"saveFolder": curSingleAudioPackageFolder,
}
self.downloadFile(imageFileInfo)
def saveSingleAudio(self, singleAudioDict):
seriesId = singleAudioDict["seriesId"]
audioPackageId = singleAudioDict["audioPackageId"]
audioId = singleAudioDict["pk"]
curSingleAudioFolder = getSingleAudioFolder(seriesId, audioPackageId, audioId)
print("curSingleAudioFolder=%s" % curSingleAudioFolder)
createFolder(curSingleAudioFolder)
filenamePrefix = "series_%s_AudioPackages_%s_audio_%s" % (seriesId, audioPackageId, audioId)
singleAudioFilename = "%s_info.json" % (filenamePrefix)
singleAudioFullPath = os.path.abspath(os.path.join(curSingleAudioFolder, singleAudioFilename))
saveJsonToFile(singleAudioFullPath, singleAudioDict)
"""
{
"pk": 6497,
"picture": "EnglishLevelFrontCoverOrInnerPage/79/封面.jpg",
"path": "Audio/1808/20180911222508831.mp3",
"extension": ".mp3",
"title": "1. Bear Hugs-Listen and Repeat",
"size": 1735488,
"duration": 433,
"sizeString": "1.7M",
"durationString": "07:13",
"packageName": "Bear Hugs ",
"seriesId": 31,
"audioPackageId": 1808
}
"""
# download audio file
# "path": "Audio/1808/20180911222516379.mp3",
audioFileUrlTail = singleAudioDict["path"]
print("audioFileUrlTail=%s" % audioFileUrlTail)
if audioFileUrlTail:
audioFileInfo = {
"fileUrl": gResourcesRoot + "/" + audioFileUrlTail,
"filename": ("Aduios_%s_" % audioId) + audioFileUrlTail.replace("/", "_"),
"saveFolder": curSingleAudioFolder,
}
self.downloadFile(audioFileInfo)
When debugging locally, all of the files I needed could be downloaded.
Then I went to run it as a batch job, and it failed with an error:
[Solved] PySpider reports an error during batch download: HTTP 599 Operation timed out after milliseconds with out of bytes received
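HTTP 599 is PySpider's fetcher-level error for the underlying download timing out; that post records how it was eventually resolved. As one hedged sketch (my assumption here, not necessarily the actual fix from that post), the fetch timeouts and retries can be raised for the large audio/image files, since self.crawl accepts connect_timeout, timeout and retries parameters, e.g. by adjusting the downloadFile method in the Handler class above:
    # Hedged sketch (assumption, not necessarily the confirmed fix):
    # raise PySpider's fetch timeouts and retries for the large media files,
    # replacing the downloadFile method of the Handler class above.
    def downloadFile(self, fileInfo):
        urlToDownload = fileInfo["fileUrl"]
        print("urlToDownload=%s" % urlToDownload)
        self.crawl(urlToDownload,
            callback=self.downloadFileCallback,
            connect_timeout=60,   # default is 20 seconds
            timeout=600,          # default is 120 seconds; audio files can be several MB
            retries=5,            # default is 3
            save=fileInfo)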