實在沒招了。求大神幫忙修改這段Python爬蟲代碼。-有解無憂

這是一個爬取某報紙網站的爬蟲。本人是新手，水平有限，反覆修改了好幾天仍然爬不到內容，煩請大神幫忙改改代碼，萬分感謝。

import requests
import bs4
import os
import datetime
import time
def fetchUrl(url, timeout=10):
    '''
    Request a web page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. It is
            passed straight to requests.get; without it a stalled
            connection would block the crawl indefinitely.

    Returns:
        The response body as a string, decoded with the encoding that
        requests detects from the content (apparent_encoding).

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.Timeout: The server did not answer within ``timeout``.
    '''
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    r = requests.get(url, headers=headers, timeout=timeout)
    # Turn 4xx/5xx answers into an exception instead of parsing an error page.
    r.raise_for_status()
    # Decode with the encoding detected from the body, not the header value.
    r.encoding = r.apparent_encoding
    return r.text
def getPageList(year, month, day):
    '''
    Collect the links to every page (section) of one day's newspaper.

    Args:
        year: Four-digit year as a string, e.g. '2021'.
        month: Two-digit month as a string, e.g. '09'.
        day: Two-digit day as a string, e.g. '16'.

    Returns:
        A list of absolute page URLs, one per <dd> entry that contains a
        link with 'index.html' in its href. The list is empty when the
        fetched HTML has no <div class="dd-box"> container.
    '''
    base = 'https://www.shobserver.com/staticsg/res/html/journal/'
    url = base + 'index.html?date=' + year + '-' + month + '-' + day + '&page=01'
    html = fetchUrl(url)
    bsobj = bs4.BeautifulSoup(html, 'html.parser')
    container = bsobj.find('div', attrs={'class': 'dd-box'})
    if container is None:
        # find() returns None when the container is absent; report "no pages"
        # rather than failing with AttributeError on None.
        return []
    linkList = []
    for page in container.find_all('dd'):
        pageUrl = None
        for temp in page.find_all('a'):
            # .get() tolerates anchors that carry no href attribute.
            link = temp.get('href', '')
            if 'index.html' in link:
                pageUrl = base + link
        # Only record entries that really yielded a page link; previously a
        # stale URL from an earlier iteration was appended in that case.
        if pageUrl is not None:
            linkList.append(pageUrl)
    return linkList
def getTitleList(year, month, day, pageUrl):
    '''
    Collect the links to every article listed on one newspaper page.

    Args:
        year: Year string; not used here, kept so existing callers work.
        month: Month string; not used here, kept so existing callers work.
        day: Day string; not used here, kept so existing callers work.
        pageUrl: URL of the page whose article list is read.

    Returns:
        A list of absolute article URLs, one per <dd> entry that contains a
        link with 'detail.html' in its href. The list is empty when the
        fetched HTML has no <div class="dd-box news-list"> container.
    '''
    base = 'https://www.shobserver.com/staticsg/res/html/journal/'
    html = fetchUrl(pageUrl)
    bsobj = bs4.BeautifulSoup(html, 'html.parser')
    container = bsobj.find('div', attrs={'class': 'dd-box news-list'})
    if container is None:
        # find() returns None when the container is absent; report "no
        # articles" rather than failing with AttributeError on None.
        return []
    linkList = []
    for title in container.find_all('dd'):
        articleUrl = None
        for temp in title.find_all('a'):
            # .get() tolerates anchors that carry no href attribute.
            link = temp.get('href', '')
            if 'detail.html' in link:
                articleUrl = base + link
        # Skip entries without an article link. Previously this appended an
        # unassigned or stale variable, which raised on the first such entry
        # or duplicated an earlier URL.
        if articleUrl is not None:
            linkList.append(articleUrl)
    return linkList
def getContent(html):
    '''
    Extract the title and body text of an article from its HTML.

    Args:
        html: HTML source of the article page.

    Returns:
        The text of every <div class="con-title"> followed by the text of
        every <div class="txt-box">, each terminated by a newline. An empty
        string is returned when neither kind of element is present.
    '''
    soup = bs4.BeautifulSoup(html, 'html.parser')
    heading_nodes = soup.find_all('div', attrs={'class': 'con-title'})
    body_nodes = soup.find_all('div', attrs={'class': 'txt-box'})
    # One line per matched element, titles first, then the article text.
    heading_text = ''.join(node.text + '\n' for node in heading_nodes)
    body_text = ''.join(node.text + '\n' for node in body_nodes)
    return heading_text + body_text
def saveFile(content, path, filename):
    '''
    Write text to a UTF-8 file, creating the target directory if needed.

    Args:
        content: Text to write.
        path: Directory to write into, with or without a trailing
            separator. An empty string means the current directory.
        filename: Name of the file inside ``path``. An existing file of
            that name is overwritten.
    '''
    if path:
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test; an empty path needs no directory.
        os.makedirs(path, exist_ok=True)
    # os.path.join inserts the separator only when it is missing, so a path
    # without a trailing slash no longer gets glued onto the file name.
    with open(os.path.join(path, filename), 'w', encoding='utf-8') as f:
        f.write(content)
def download_rmrb(year, month, day, destdir):
    '''
    Download every article of one day's newspaper and save each as a text file.

    Args:
        year: Year string, e.g. '2021'.
        month: Month string, e.g. '09'.
        day: Day string, e.g. '16'.
        destdir: Root directory; files go into destdir/<year><month><day>/.
    '''
    date_tag = year + month + day
    for page_url in getPageList(year, month, day):
        for article_url in getTitleList(year, month, day, page_url):
            # Fetch the article page and reduce it to plain text.
            text = getContent(fetchUrl(article_url))
            # Build the file name from the article URL: take the
            # second-to-last '='-separated piece, cut it at the first '&',
            # then split on '-'.
            # NOTE(review): presumably this is a query-parameter value that
            # identifies the article; confirm against real article URLs.
            pieces = article_url.split('=')[-2].split('&')[0].split('-')
            # Both numbers come from the same piece; the second is
            # prefixed with '0' when its integer value is below 10.
            page_no = pieces[0]
            if int(pieces[0]) >= 10:
                title_no = pieces[0]
            else:
                title_no = '0' + pieces[0]
            out_dir = destdir + '/' + date_tag + '/'
            out_name = date_tag + '-' + page_no + '-' + title_no + '.txt'
            saveFile(text, out_dir, out_name)
if __name__ == '__main__':
    # Script entry point: ask for a date, then crawl that day's newspaper.
    newsDate = input('請輸入要爬取的日期（格式如 20210916 ）:').strip()
    # Validate before slicing: strptime rejects impossible dates, and the
    # length check rejects short forms such as '2021916' that strptime
    # would otherwise accept.
    try:
        parsedDate = datetime.datetime.strptime(newsDate, '%Y%m%d')
    except ValueError:
        parsedDate = None
    if parsedDate is None or len(newsDate) != 8:
        raise SystemExit('日期格式錯誤，請輸入 8 位數字的日期，例如 20210916')
    # Rebuild the parts from the parsed date so they are always zero-padded.
    year = '{:04d}'.format(parsedDate.year)
    month = '{:02d}'.format(parsedDate.month)
    day = '{:02d}'.format(parsedDate.day)
    # NOTE(review): 'D:02/cqrb' has no separator after the drive letter;
    # confirm this is the intended output directory.
    download_rmrb(year, month, day, 'D:02/cqrb')
    print("爬取完成：" + year + month + day)

轉載請註明出處，本文鏈接：https://www.uj5u.com/qita/276791.html

標籤：腳本語言(Perl/Python)

上一篇：寫腳本的大神來個

下一篇：球球各位大神幫幫孩子吧，孩子的python作業怎麼也做不出來，馬上就截止了，球球各位大神幫幫孩子吧