# PTT Beauty board crawler: collect today's articles, list the popular ones,
# and download the imgur-hosted .jpg images linked in each popular article.
import requests
import datetime
import json
import re
import os
import urllib.request
from bs4 import BeautifulSoup

PTT_url = "https://www.ptt.cc"


def get_webPage(url):
    # The over18 cookie is required to view boards flagged as adult content
    res = requests.get(url, cookies={'over18': '1'})
    if res.status_code != 200:
        print("Invalid URL:", res.url)
        return None
    return res.text
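

# A hardened fetch variant (a sketch, not part of the original flow): reuse a
# requests.Session so the over18 cookie and headers persist across requests.
# The function name and User-Agent string are illustrative assumptions.
def get_webPage_with_session(url, session=None):
    session = session or requests.Session()
    session.cookies.set('over18', '1')  # same age-gate cookie as get_webPage
    session.headers['User-Agent'] = 'Mozilla/5.0 (compatible; ptt-crawler)'
    res = session.get(url)
    return res.text if res.status_code == 200 else None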


def get_articles(page, date):
    soup = BeautifulSoup(page, 'html5lib')
    # Location of the "previous page" link
    prevURL = soup.select('.btn-group-paging a')[1]['href']
    # Collect the articles posted on the given date
    articles = []
    divs = soup.select('.r-ent')
    for article in divs:
        if article.find('div', 'date').text.strip() == date:
            # Read the push (upvote) count
            pushCount = 0
            pushString = article.find('div', 'nrec').text
            if pushString:
                try:
                    pushCount = int(pushString)  # convert the string to a number
                except ValueError:
                    # PTT shows "爆" instead of a number for 100+ pushes and
                    # an "X"-prefixed marker for heavily downvoted articles
                    if pushString == "爆":
                        pushCount = 99
                    elif pushString.startswith('X'):
                        pushCount = -10
            # Skip entries without a link (e.g. deleted articles)
            if article.find('a'):
                title = article.find('a').text    # article title
                href = article.find('a')['href']  # article link
                author = article.find('div', 'author').text  # author name
                articles.append({
                    'title': title,
                    'href': href,
                    'pushCount': pushCount,
                    'author': author
                })
    return articles, prevURL
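
# Illustrative shape of get_articles' return value (made-up data):
#   ([{'title': '[正妹] ...', 'href': '/bbs/Beauty/M.1234567890.A.ABC.html',
#      'pushCount': 25, 'author': 'someuser'}], '/bbs/Beauty/index3900.html')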


if __name__ == "__main__":
    allArticles = []
    # Fetch the first index page of the Beauty board
    currentPage = get_webPage(PTT_url + '/bbs/Beauty/index.html')
    # Get today's date from the local machine
    todayRoot = datetime.date.today()
    # Match PTT's date format by dropping the leading '0' (e.g. "03/05" -> "3/05")
    today = todayRoot.strftime("%m/%d").lstrip('0')
    articles, prevURL = get_articles(currentPage, today)
    # As long as the current page returned matching articles, keep following
    # the "previous page" link to look for more articles from the same day
    while articles:
        allArticles += articles
        currentPage = get_webPage(PTT_url + prevURL)
        articles, prevURL = get_articles(currentPage, today)
    # Print an overview of the collected articles
    print('Found', len(allArticles), 'articles today')
    threshold = 10  # push-count threshold for a "popular" article
    print('Popular articles (> %d pushes):' % threshold)
    for article in allArticles:
        if article['pushCount'] > threshold:
            print(article['title'], PTT_url + article['href'])
            # Re-enter the article page to scrape its images
            url = PTT_url + article['href']
            newRequest = get_webPage(url)
            soup = BeautifulSoup(newRequest, 'html5lib')
            # Find links to imgur-hosted .jpg images
            imgLinks = soup.find_all('a', {'href': re.compile(r'https://(imgur|i\.imgur)\.com/.*\.jpg$')})
            if len(imgLinks) > 0:
                # Create a folder named after the article title, replacing
                # characters that are unsafe in file and directory names
                folderName = re.sub(r'[\\/:*?"<>|]', '_', article['title'].strip())
                os.makedirs(folderName, exist_ok=True)
                try:
                    for imgLink in imgLinks:
                        print(imgLink['href'])
                        fileName = imgLink['href'].split("/")[-1]
                        urllib.request.urlretrieve(imgLink['href'], os.path.join(folderName, fileName))
                except Exception as e:
                    print(e)
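
    # Optionally persist the collected metadata as JSON (a sketch; the output
    # filename 'articles.json' is an assumption, not part of the original flow).
    with open('articles.json', 'w', encoding='utf-8') as f:
        json.dump(allArticles, f, ensure_ascii=False, indent=2)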