訂閱
糾錯
加入自媒體

文獻批量下載器PyCNKi使用教程

PyCNKi下載器使用教程

PyCNKi下載器源碼

(百度鏈接里有.ipynb格式源碼)

一、導入庫

# Standard library
import re
import time
import urllib.error

# Third-party
import openpyxl
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait

二、打開知網并進行初始設置

def wu_visual():
    """Build Chrome options for running without a visible browser window.

    Returns:
        Options: a new ``Options`` object carrying the ``--headless`` and
        ``--disable-gpu`` command-line arguments.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    return chrome_options
def fan_jiance():
    """Build Chrome options with the ``enable-automation`` switch excluded.

    NOTE(review): the name suggests this is meant to make the browser harder
    to detect as automated -- confirm against the target site's behavior.

    Returns:
        ChromeOptions: a new options object whose ``excludeSwitches``
        experimental option is ``['enable-automation']``.
    """
    option = ChromeOptions()
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # option.add_argument('-kiosk')
    return option


def url_error_test(url, bro):
    """Load ``url`` with ``bro`` and report urllib-level failures.

    Args:
        url: address passed to ``bro.get``.
        bro: any object exposing ``get(url)``.

    Returns:
        ``None`` when the page loads (after printing ``OK``); otherwise the
        ``reason`` of the ``HTTPError``/``URLError`` that was raised.

    NOTE(review): only ``urllib.error`` exceptions are handled here; a
    Selenium driver may raise its own exception types from ``get`` -- confirm.
    """
    try:
        bro.get(url)
    except urllib.error.HTTPError as err:
        # HTTPError must be caught first: it is a subclass of URLError.
        print(err.code)
        print(err.reason)
        return err.reason
    except urllib.error.URLError as err:
        print(err.reason)
        return err.reason
    # The exception name is unbound once its ``except`` block ends, so the
    # reason is returned inside each handler and success returns None.
    print("OK")
    return None
# Build the headless option set and the automation-switch option set.
chrome_options = wu_visual()
option = fan_jiance()

# Selenium's deprecated ``chrome_options`` keyword replaces ``options`` when
# both are supplied, which would silently drop the settings held by ``option``.
# Copy them onto ``chrome_options`` so a single object carries everything.
for option_name, option_value in option.experimental_options.items():
    chrome_options.add_experimental_option(option_name, option_value)

chrome_path = r'./chromedriver.exe'
bro = webdriver.Chrome(executable_path=chrome_path, options=chrome_options)

# Firefox users can remove the "#" from the next line instead.
# bro = webdriver.Firefox()

bro.maximize_window()  # maximize the browser window
url = r'http://kns.cnki.net'  # CNKI site address
bro.get(url)

三、關鍵詞搜索

# Simulate typing a keyword query.
# Pick whichever query mode you need; this script only offers title search.
input_title = bro.find_element(By.ID, "txt_SearchText")
input_title.click()
time.sleep(2)
key_value = input("請輸入你要下載的論文標題:")
input_title.send_keys(key_value)

# Click the search button.
div_search = bro.find_element(
    By.XPATH, '/html/body/div[1]/div[2]/div/div[1]/input[2]'
)
div_search.click()
time.sleep(1)

# Click the journal-article entry, then read its result count.
# NOTE(review): 20 is presumably the number of results shown per page -- confirm.
default_1 = 20
bro.find_element(
    By.XPATH, "/html/body/div[5]/div[1]/div/ul[1]/li[1]/a/span"
).click()
time.sleep(10)
total_num = bro.find_element(
    By.XPATH, "/html/body/div[5]/div[1]/div/ul[1]/li[1]/a/em"
)

print("一共搜索到" + total_num.text + "條結果")
if int(total_num.text) <= default_1:
    print("共一頁")
    # Later code compares the requested page against ``num``; define it for
    # the single-page case too so that comparison cannot hit an unbound name.
    num = 1
else:
    total_page = bro.find_element(
        By.XPATH, '//*[@id="gridTable"]/div[2]/span[1]'
    )
    print(total_page.text)
    # Drop the first and last character of the label and parse the rest as
    # the page count.
    num = int(total_page.text[1:-1])

四、選擇下載格式及批量下載到幾頁

# Ask which file format to download: 1 selects PDF, 2 selects CAJ.
print("1:PDF格式2:CAJ格式請輸入下載文件的格式對應數字:")
load_num = None
# Keep asking until the answer is one of the two supported choices; any other
# value would leave the download step with nothing to click.
while load_num not in (1, 2):
    try:
        load_num = int(input("請輸入1 or 2:"))
    except ValueError:
        load_num = None

# Prompt for the last result page to download (the value is read afterwards).
print("請輸入您要下載到第幾頁碼:")

五、開始批量下載

# XPath of the heading that holds the paper title on a detail page.
_TITLE_XPATH = "/html/body/div[2]/div[1]/div[3]/div/div[1]/div[3]/div[1]/h1"
# XPath of the element whose text signals a rejected detail-page URL.
_ERROR_XPATH = '/html/body/div[2]/div'
# NOTE(review): the page text is presumably Simplified Chinese; the
# Traditional form is accepted as well because that is what this listing
# shows -- confirm which one the site really serves.
_URL_ERROR_TEXTS = ("URL参数错误", "URL參數錯誤")
# Result-list links, and the download anchors matched by id or by name.
_RESULT_LINKS_XPATH = '//*[@id="gridTable"]/table/tbody/tr/td[2]/a'
_DOWNLOAD_XPATHS = {
    1: '//*[@id="pdfDown" or @name="pdfDown"]',
    2: '//*[@id="cajDown" or @name="cajDown"]',
}


def _download_paper(link, number):
    """Open one detail page in its own browser and click its download link.

    Args:
        link: detail-page URL to open.
        number: running number printed in the status message.
    """
    bro_new = webdriver.Chrome(
        executable_path=chrome_path,
        chrome_options=chrome_options,
        options=option,
    )
    try:
        bro_new.get(link)
        bro_new.maximize_window()
        time.sleep(10)  # fixed wait for the detail page to finish loading
        label = "編號為" + str(number) + "的論文:"
        if bro_new.find_element(By.XPATH, _ERROR_XPATH).text in _URL_ERROR_TEXTS:
            print(label + bro_new.find_element(By.XPATH, _TITLE_XPATH).text
                  + "————論文下載失敗")
            return
        download_xpath = _DOWNLOAD_XPATHS.get(load_num)
        if download_xpath is None:
            return  # unsupported format choice: nothing to click
        bro_new.find_element(By.XPATH, download_xpath).click()
        time.sleep(10)  # fixed wait for the download to start
        print(label + bro_new.find_element(By.XPATH, _TITLE_XPATH).text
              + "————下載成功")
    finally:
        # Always close the per-paper browser, including on errors, and only
        # after the title has been read from it.
        bro_new.quit()


def _download_listed_papers(first_number):
    """Download every paper linked on the result page currently shown in ``bro``.

    Args:
        first_number: number given to the first paper on this page.

    Returns:
        The number to give to the first paper of the following page.
    """
    anchors = bro.find_elements(By.XPATH, _RESULT_LINKS_XPATH)
    hrefs = [anchor.get_attribute("href") for anchor in anchors]
    number = first_number
    for href in hrefs:
        # NOTE(review): the slice drops the first 20 characters of the href,
        # which assumes a fixed-length prefix -- confirm against real links.
        link = url + r'/kcms/detail/detail.aspx?' + href[20:]
        _download_paper(link, number)
        number += 1
    return number


# Read the last page to download. A result set that fits on one page only has
# page 1, and ``num`` is not consulted in that case.
max_page = 1 if int(total_num.text) <= default_1 else num
load_page = int(input())
while load_page > max_page or load_page <= 0:
    print("輸入頁碼錯誤,請重新輸入:")
    load_page = int(input())

count = 1  # running paper number across all downloaded pages
for page_index in range(load_page):
    count = _download_listed_papers(count)
    # Only advance when another requested page remains; the last page may not
    # offer a "next" control.
    if page_index < load_page - 1:
        bro.find_element(By.XPATH, '//*[@id="PageNext"]').click()
        time.sleep(10)


聲明: 本文由入駐維科號的作者撰寫,觀點僅代表作者本人,不代表OFweek立場。如有侵權或其他問題,請聯系舉報。

發表評論

0條評論,0人參與

請輸入評論內容...

請輸入評論/評論長度6~500個字

您提交的評論過于頻繁,請輸入驗證碼繼續

  • 看不清,點擊換一張  刷新

暫無評論

暫無評論

人工智能 獵頭職位 更多
掃碼關注公眾號
OFweek人工智能網
獲取更多精彩內容
文章糾錯
x
*文字標題:
*糾錯內容:
聯系郵箱:
*驗 證 碼:

粵公網安備 44030502002758號