文章内容

2018/9/7 16:07:22,作 者: 黄兵

百度文库API免费下载百度文库收费资料(DOC|PPT|TXT|PDF)

百度文库API免费下载百度文库收费资料(DOC|PPT|TXT|PDF)

以下是源代码:

#!/usr/bin/python
# -*- coding:utf-8 -*-

import requests
import re
import argparse
import json
import os

# Command-line interface: two required positional arguments, the document
# page URL and the document type. NOTE: parsing happens at module level, so
# it runs as soon as this file is executed or imported.
parser = argparse.ArgumentParser()
parser.add_argument('url', help="Target Url,你所需要文档的URL", type=str)
parser.add_argument('type', help="Target Type,你所需要文档的的类型(DOC|PPT|TXT|PDF)", type=str)
args = parser.parse_args()

# Module-level copies of the parsed arguments; the entry point at the bottom
# of the script reads both names.
url = args.url
# NOTE(review): this shadows the builtin `type`; the name is kept because the
# entry point refers to it.
type = args.type

# Choose the handler function according to the document type
# Shared state for DOC(): holds the "y" value of the most recently handled
# text fragment. DOC() starts a new output line whenever the value changes.
y = 0


def DOC(url):
    """Download the text of a DOC-type document into ``<doc_id>.txt``.

    The document page is fetched, the per-page JSON URLs embedded in its HTML
    are extracted, and every text fragment found in those JSON files is
    appended to the output file. A line break is inserted whenever the
    fragment's "y" value differs from the previous fragment's.

    Args:
        url: Document page URL containing ``view/<doc_id>.html``.

    Raises:
        IndexError: If ``url`` does not contain ``view/<doc_id>.html``.

    Side effects:
        Appends to ``<doc_id>.txt`` in the current directory, prints every
        decoded fragment, and updates the module-level ``y``.
    """
    global y

    # Raw pattern with an escaped dot so that only a literal ".html" matches.
    doc_id = re.findall(r'view/(.*?)\.html', url)[0]
    html = requests.get(url).text
    lists = re.findall('(https.*?0.json.*?)\\\\x22}', html)
    # Only the first half of the matches is used (kept from the original
    # logic; presumably the page lists every URL twice - TODO confirm).
    page_urls = lists[:len(lists) // 2]

    filename = doc_id + '.txt'
    wrote_any = False
    for page_url in page_urls:
        # The URLs are embedded with escaping backslashes; strip them.
        txts = requests.get(page_url.replace('\\', '')).text
        fragments = re.findall('"c":"(.*?)".*?"y":(.*?),', txts)
        if not fragments:
            continue
        # Open the output once per page instead of once per fragment.
        with open(filename, 'a', encoding='utf-8') as f:
            for raw_text, y_pos in fragments:
                # Turn \uXXXX escape sequences into real characters.
                text = raw_text.encode('utf-8').decode('unicode_escape', 'ignore')
                print(text)
                if y != y_pos:
                    y = y_pos
                    n = '\n'
                else:
                    n = ''
                f.write(n + text.replace('\\', ''))
        wrote_any = True

    # Report once at the end rather than after every fragment.
    if wrote_any:
        print("文档保存在" + filename)


def PPT(url):
    """Download every slide image of a PPT-type document.

    The image URLs are requested from the ``getbcsurl`` endpoint and each
    image is saved as ``<doc_id>/img<N>.jpg``.

    Args:
        url: Document page URL containing ``view/<doc_id>.html``.

    Raises:
        IndexError: If ``url`` does not contain ``view/<doc_id>.html``.
        OSError: If the output folder or an image file cannot be created.
    """
    # Raw pattern with an escaped dot so that only a literal ".html" matches.
    doc_id = re.findall(r'view/(.*?)\.html', url)[0]
    url = "https://wenku.baidu.com/browse/getbcsurl?doc_id=" + doc_id + "&pn=1&rn=99999&type=ppt"
    html = requests.get(url).text
    # The URLs come back with escaping backslashes; strip them.
    image_urls = [
        escaped.replace("\\", '')
        for escaped in re.findall('{"zoom":"(.*?)","page"', html)
    ]

    # An existing folder is fine; any other failure surfaces instead of
    # being swallowed.
    os.makedirs(doc_id, exist_ok=True)

    for index, image_url in enumerate(image_urls):
        img = requests.get(image_url).content
        # os.path.join keeps the images inside the folder on every platform;
        # a hard-coded backslash only works as a separator on Windows.
        with open(os.path.join(doc_id, 'img' + str(index) + '.jpg'), 'wb') as m:
            m.write(img)
    print("PPT图片保存在" + doc_id + "文件夹")


def _iter_c_strings(node):
    """Yield, in document order, every string stored under a "c" key."""
    if isinstance(node, dict):
        for key, value in node.items():
            if key == 'c' and isinstance(value, str):
                yield value
            else:
                yield from _iter_c_strings(value)
    elif isinstance(node, list):
        for item in node:
            yield from _iter_c_strings(item)


def TXT(url):
    """Download the text of a TXT-type document into ``<doc_id>.txt``.

    The document metadata (md5sum, page count, rsign) is read from the
    ``getdocinfo`` endpoint and used to build the text download URL. The
    JSON it returns is searched for "c" string values, which are appended to
    the output file.

    Args:
        url: Document page URL containing ``view/<doc_id>.html``.

    Raises:
        IndexError: If ``url`` does not contain ``view/<doc_id>.html`` or the
            metadata response lacks one of the expected fields.
        ValueError: If the text response is not valid JSON.
    """
    # Raw pattern with an escaped dot so that only a literal ".html" matches.
    doc_id = re.findall(r'view/(.*?)\.html', url)[0]
    url = "https://wenku.baidu.com/api/doc/getdocinfo?callback=cb&doc_id=" + doc_id
    html = requests.get(url).text
    md5 = re.findall('"md5sum":"(.*?)"', html)[0]
    pn = re.findall('"totalPageNum":"(.*?)"', html)[0]
    rsign = re.findall('"rsign":"(.*?)"', html)[0]
    # md5 is appended directly after "type=txt", as in the original URL
    # layout (presumably the field already carries its own "&..." prefix -
    # TODO confirm).
    NewUrl = 'https://wkretype.bdimg.com/retype/text/' + doc_id + '?rn=' + pn + '&type=txt' + md5 + '&rsign=' + rsign
    txt = requests.get(NewUrl).text
    jsons = json.loads(txt)
    # Walk the parsed JSON instead of regex-matching its Python repr: the
    # repr switches quote style when a paragraph contains an apostrophe,
    # which made the regex drop or truncate that text. The values read here
    # already contain real CR/LF characters, so no unescaping is needed.
    texts = list(_iter_c_strings(jsons))
    print(texts)
    filename = doc_id + '.txt'
    with open(filename, 'a', encoding='utf-8') as f:
        f.writelines(texts)
    print("文档保存在" + filename)


def PDF(url):
    """Download every page image of a PDF-type document.

    The image URLs are requested from the ``getbcsurl`` endpoint and each
    image is saved as ``<doc_id>/img<N>.jpg``.

    Args:
        url: Document page URL containing ``view/<doc_id>.html``.

    Raises:
        IndexError: If ``url`` does not contain ``view/<doc_id>.html``.
        OSError: If the output folder or an image file cannot be created.
    """
    # Raw pattern with an escaped dot so that only a literal ".html" matches.
    doc_id = re.findall(r'view/(.*?)\.html', url)[0]
    # NOTE(review): the request uses type=ppt, the same value as PPT();
    # kept unchanged - confirm this is intended for PDF documents.
    url = "https://wenku.baidu.com/browse/getbcsurl?doc_id=" + doc_id + "&pn=1&rn=99999&type=ppt"
    html = requests.get(url).text
    # The URLs come back with escaping backslashes; strip them.
    image_urls = [
        escaped.replace("\\", '')
        for escaped in re.findall('{"zoom":"(.*?)","page"', html)
    ]

    # Create the folder once; an existing folder is fine, any other failure
    # surfaces instead of being swallowed.
    os.makedirs(doc_id, exist_ok=True)

    for index, image_url in enumerate(image_urls):
        img = requests.get(image_url).content
        # The write must happen inside the loop: previously it ran once after
        # the loop, so only the last image was saved and an empty result
        # raised NameError.
        with open(os.path.join(doc_id, 'img' + str(index) + '.jpg'), 'wb') as m:
            m.write(img)
    print("PDF图片保存在" + doc_id + "文件夹")


if __name__ == "__main__":
    # Script entry point: run the handler matching the requested document
    # type on the given URL.
    #
    # SECURITY: the handler used to be resolved with eval() on the
    # command-line argument, which evaluates arbitrary expressions. An
    # explicit table restricts the choice to the four supported handlers.
    handlers = {'DOC': DOC, 'PPT': PPT, 'TXT': TXT, 'PDF': PDF}
    try:
        print("""###Athor:52pojie ##TIPS:PDF|PPT只能下载图片········""")
        # An unsupported type raises KeyError and gets the same usage
        # message as any other failure.
        handlers[type.upper()](url)
    except Exception:
        # Exception rather than a bare except, so Ctrl+C and SystemExit
        # still terminate the script.
        print("获取出错,可能URL错误\n使用格式name.exe url type\n请使用--help查看帮助")

但是我认为这个源码并不是很好,主要有以下几个方面:

1、doc文档保存的是txt文件,所有的格式都没有了,如果doc文件中有图片,图片也没有了;

2、PPT、PDF文件保存的是图片,无法编辑,这是一个很头疼的问题。

综上所述,这个源码并不是很好,最近找到了一个比较好的源码,正在调试。

调试好之后,会做成一个百度文库在线服务,地址会留下来,欢迎大家关注。

分享到:

发表评论

评论列表