[筆記] Python 爬蟲 PTT 八卦版 @地瓜大的飛翔旅程

章節連結

課程名稱
課程相關文章
指令

本文為 Hahow 上的 Python 網頁爬蟲入門實戰的書籍版(Python：網路爬蟲與資料分析入門實戰)課程心得，其對應的章節為 Chapter 3。本篇參照了書上的範例程式碼，並稍加改善和新增一些註解。

課程名稱

Python 網頁爬蟲入門實戰：https://bit.ly/2U6wElg
對於爬蟲初學者而言，算是滿不錯的搭配教材。如有需要，你可以搭配「Python：網路爬蟲與資料分析入門實戰」這本書來看。

課程相關文章

[筆記] Python 爬蟲實戰 – PTT 表特版和圖片下載

[筆記] Python 爬蟲 BeautifulSoup 的進階運用

[筆記] Python 爬蟲初探 BeautifulSoup

指令

import requests
import datetime
import json
from bs4 import BeautifulSoup
# Base URL of the PTT web site; board paths and scraped hrefs are appended to it
# when building request and article URLs.
PTT_url = "https://www.ptt.cc"

def get_webPage(url):
    """Download a web page and return its HTML text.

    The request carries the cookie ``over18=1`` (presumably so that PTT's
    age-confirmation prompt is skipped -- confirm against the site).

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The response body as a string, or ``None`` when the server does not
        answer with HTTP status 200.
    """
    res = requests.get(url, cookies={'over18': '1'})
    if res.status_code != 200:
        print("Invalid URL", res.url)
        # Was `return none`, which raised NameError instead of signalling failure.
        return None
    return res.text

def get_articles(page, date):
    """Collect the articles on one board index page that match *date*.

    Args:
        page: HTML text of a board index page. ``None`` or an empty string
            (e.g. a failed download) yields an empty result instead of an
            error.
        date: Date string compared against each entry's ``date`` column after
            surrounding whitespace is stripped (the caller passes the month/day
            with the leading zero removed).

    Returns:
        A tuple ``(articles, prevURL)``.  ``articles`` is a list of dicts with
        the keys ``title``, ``href``, ``pushCount`` and ``author``.
        ``prevURL`` is the ``href`` of the second link in the paging button
        group (used by the caller as the previous-page link), or ``None`` when
        there is no page or no such link.
    """
    # Nothing to parse when the download failed.
    if not page:
        return [], None

    soup = BeautifulSoup(page, 'html5lib')

    # Previous-page link: second anchor of the paging button group.  Guarded so
    # that a missing anchor or href gives None rather than IndexError/KeyError.
    paging_links = soup.select('.btn-group-paging a')
    prevURL = paging_links[1].get('href') if len(paging_links) > 1 else None

    articles = []
    for entry in soup.select('.r-ent'):
        if entry.find('div', 'date').text.strip() != date:
            continue

        # Push count: numeric text is used as-is; "爆" maps to 99, a value
        # starting with 'X' maps to -10, anything else (or empty) stays 0.
        pushCount = 0
        pushString = entry.find('div', 'nrec').text
        if pushString:
            try:
                pushCount = int(pushString)
            except ValueError:
                if pushString == "爆":
                    pushCount = 99
                elif pushString.startswith('X'):
                    pushCount = -10

        # Entries without a link are skipped (presumably deleted posts).
        link = entry.find('a')
        if link:
            articles.append({
                'title': link.text,
                'href': link['href'],
                'pushCount': pushCount,
                'author': entry.find('div', 'author').text,
            })
    return articles, prevURL

# Execution starts here when the file is run as a script; this is what the
# `if __name__ == "__main__":` guard declares.
# https://blog.csdn.net/yjk13703623757/article/details/77918633
if __name__ == "__main__":
    allArticles = []
    # Fetch the newest index page of the Gossiping board.
    currentPage = get_webPage(PTT_url + '/bbs/Gossiping/index.html')
    # Local date of this machine, formatted as month/day with the leading
    # zero removed to match the date strings compared in get_articles.
    todayRoot = datetime.date.today()
    today = todayRoot.strftime("%m/%d").lstrip('0')

    # Walk backwards page by page for as long as a page still yields articles
    # dated today.  Stop cleanly when a download fails or there is no
    # previous-page link, instead of crashing on a None page/URL.
    while currentPage:
        articles, prevURL = get_articles(currentPage, today)
        if not articles:
            break
        allArticles += articles
        if not prevURL:
            break
        currentPage = get_webPage(PTT_url + prevURL)

    # Summary of what was collected.
    print('今天有', len(allArticles), '篇文章')
    threshold = 50  # push count above which an article is listed as popular
    print('熱門文章(> %d 推):' % (threshold))
    for article in allArticles:
        # pushCount is already an int, so no conversion is needed.
        if article['pushCount'] > threshold:
            print(article['title'], PTT_url + article['href'])

    # Save as JSON; the date is reformatted to YYYYMMDD for the file name.
    # https://ithelp.ithome.com.tw/articles/10161708
    today = todayRoot.strftime("%Y%m%d")
    fileName = 'gossiping-' + today
    with open('{}.json'.format(fileName), 'w', encoding='utf-8') as file:
        json.dump(allArticles, file, indent=2, sort_keys=True, ensure_ascii=False)

import requests

import datetime

import json

from bs4 import BeautifulSoup

# Base URL of the PTT web site; board paths and scraped hrefs are appended to it
# when building request and article URLs.
PTT_url = "https://www.ptt.cc"

def get_webPage(url):
    """Download a web page and return its HTML text.

    The request carries the cookie ``over18=1`` (presumably so that PTT's
    age-confirmation prompt is skipped -- confirm against the site).

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The response body as a string, or ``None`` when the server does not
        answer with HTTP status 200.
    """
    res = requests.get(url, cookies={'over18': '1'})
    if res.status_code != 200:
        print("Invalid URL", res.url)
        # Was `return none`, which raised NameError instead of signalling failure.
        return None
    return res.text

def get_articles(page, date):
    """Collect the articles on one board index page that match *date*.

    Args:
        page: HTML text of a board index page. ``None`` or an empty string
            (e.g. a failed download) yields an empty result instead of an
            error.
        date: Date string compared against each entry's ``date`` column after
            surrounding whitespace is stripped (the caller passes the month/day
            with the leading zero removed).

    Returns:
        A tuple ``(articles, prevURL)``.  ``articles`` is a list of dicts with
        the keys ``title``, ``href``, ``pushCount`` and ``author``.
        ``prevURL`` is the ``href`` of the second link in the paging button
        group (used by the caller as the previous-page link), or ``None`` when
        there is no page or no such link.
    """
    # Nothing to parse when the download failed.
    if not page:
        return [], None

    soup = BeautifulSoup(page, 'html5lib')

    # Previous-page link: second anchor of the paging button group.  Guarded so
    # that a missing anchor or href gives None rather than IndexError/KeyError.
    paging_links = soup.select('.btn-group-paging a')
    prevURL = paging_links[1].get('href') if len(paging_links) > 1 else None

    articles = []
    for entry in soup.select('.r-ent'):
        if entry.find('div', 'date').text.strip() != date:
            continue

        # Push count: numeric text is used as-is; "爆" maps to 99, a value
        # starting with 'X' maps to -10, anything else (or empty) stays 0.
        pushCount = 0
        pushString = entry.find('div', 'nrec').text
        if pushString:
            try:
                pushCount = int(pushString)
            except ValueError:
                if pushString == "爆":
                    pushCount = 99
                elif pushString.startswith('X'):
                    pushCount = -10

        # Entries without a link are skipped (presumably deleted posts).
        link = entry.find('a')
        if link:
            articles.append({
                'title': link.text,
                'href': link['href'],
                'pushCount': pushCount,
                'author': entry.find('div', 'author').text,
            })
    return articles, prevURL

# Execution starts here when the file is run as a script; this is what the
# `if __name__ == "__main__":` guard declares.
# https://blog.csdn.net/yjk13703623757/article/details/77918633
if __name__ == "__main__":
    allArticles = []
    # Fetch the newest index page of the Gossiping board.
    currentPage = get_webPage(PTT_url + '/bbs/Gossiping/index.html')
    # Local date of this machine, formatted as month/day with the leading
    # zero removed to match the date strings compared in get_articles.
    todayRoot = datetime.date.today()
    today = todayRoot.strftime("%m/%d").lstrip('0')

    # Walk backwards page by page for as long as a page still yields articles
    # dated today.  Stop cleanly when a download fails or there is no
    # previous-page link, instead of crashing on a None page/URL.
    while currentPage:
        articles, prevURL = get_articles(currentPage, today)
        if not articles:
            break
        allArticles += articles
        if not prevURL:
            break
        currentPage = get_webPage(PTT_url + prevURL)

    # Summary of what was collected.
    print('今天有', len(allArticles), '篇文章')
    threshold = 50  # push count above which an article is listed as popular
    print('熱門文章(> %d 推):' % (threshold))
    for article in allArticles:
        # pushCount is already an int, so no conversion is needed.
        if article['pushCount'] > threshold:
            print(article['title'], PTT_url + article['href'])

    # Save as JSON; the date is reformatted to YYYYMMDD for the file name.
    # https://ithelp.ithome.com.tw/articles/10161708
    today = todayRoot.strftime("%Y%m%d")
    fileName = 'gossiping-' + today
    with open('{}.json'.format(fileName), 'w', encoding='utf-8') as file:
        json.dump(allArticles, file, indent=2, sort_keys=True, ensure_ascii=False)

按讚加入粉絲團

課程名稱

課程相關文章

指令

延伸閱讀