您好:参考书本范例,并有修改一些小问题pttbeauty.json是有撷取资料但images内没有图档请问,是图档有权限,还是还有哪边设定需要再处理(范本较旧)谢谢
import scrapy
from Ch9_4a.items import BeautyItem
from datetime import datetime
class PptbeautySpider(scrapy.Spider):
name = "pttbeauty"
allowed_domains = ["ptt.cc"]
start_urls = ["https://www.ptt.cc/bbs/Beauty/index.html"]
def __init__(self):
self.max_pages = 2 # 最大页数
self.num_of_pages = 0 # 目前已爬取的页数
def parse(self, response):
for href in response.css(".r-ent > div.title > a::attr(href)"):
url = response.urljoin(href.extract())
#print("-------------------url-AA--------------------------",url)
yield scrapy.Request(url, callback=self.parse_post)
#print("-------------------parse---------------------------")
self.num_of_pages = self.num_of_pages + 1
#print("-------------------parse------self.num_of_pages---------------------", self.num_of_pages)
# 是否已经到达最大页数
if self.num_of_pages < self.max_pages:
prev_page = response.xpath(\'//div[@id="action-bar-container"]//a[contains(text(), "上页")]/@href\')
if prev_page: # 是否有上一页
# print("-------------------prev_page[0]---------------------------",prev_page[0])
url = response.urljoin(prev_page[0].extract())
# print("-------------------url---------------------------",url)
yield scrapy.Request(url, self.parse)
else:
print("已经是最后一页, 总共页数: ", self.num_of_pages)
else:
print("已经到达最大页数: ", self.max_pages)
def parse_post(self, response):
#print("-------------------parse_post---------------------------")
item = BeautyItem()
#item["author"] = response.css(".article-metaline:nth-child(1) .article-meta-value::text").extract_first()
item["author"] = response.css(".article-metaline:nth-child(1) .article-meta-value::text").extract_first()
item["title"] = response.css(".article-metaline-right+ .article-metaline .article-meta-value::text").extract_first()
#article-metaline-right+ 下一个
print( item["author"], item["title"])
datetime_str = response.css(".article-metaline+ .article-metaline .article-meta-value::text").extract_first()
item["date"] = datetime.strptime(datetime_str, \'%a %b %d %H:%M:%S %Y\')
score = 0
num_of_pushes = 0
comments = response.xpath(\'//div[@class="push"]\')
for comment in comments:
push = comment.css("span.push-tag::text")[0].extract()
if "推" in push:
score = score + 1
num_of_pushes = num_of_pushes + 1
elif "嘘" in push:
score = score - 1
item["score"] = score
item["pushes"] = num_of_pushes
item["comments"] = len(comments)
item["url"] = response.url
img_urls = response.xpath(\'//a[contains(@href, "imgur.com")]/@href\').extract()
if img_urls:
img_urls = [url for url in img_urls if url.endswith(".jpg")]
item["images"] = len(img_urls)
item["file_urls"] = img_urls
else:
item["images"] = 0
item["file_urls"] = []
yield item
items
import scrapy
class BeautyItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
author = scrapy.Field()
date = scrapy.Field()
score = scrapy.Field()
pushes = scrapy.Field()
comments = scrapy.Field()
url = scrapy.Field()
images = scrapy.Field()
file_urls = scrapy.Field()
pipelines.py
from itemadapter import ItemAdapter
class Ch94APipeline:
def process_item(self, item, spider):
return item
settings.py
BOT_NAME = \'Ch9_4a\'
SPIDER_MODULES = [\'Ch9_4a.spiders\']
NEWSPIDER_MODULE = \'Ch9_4a.spiders\'
# 输出 JSON 资料
#FEED_FORMAT = "json"
#FEED_URI = "pttbeauty.json"
#FEED_EXPORT_ENCODING = "utf-8"
FEEDS = {
\'pttbeauty.json\': {
\'format\': \'json\',
\'encoding\': \'utf8\',
\'store_empty\': False,
\'fields\': None,
\'indent\': 4,
},
}
ITEM_PIPELINES = {
\'scrapy.pipelines.files.FilesPipeline\': 1
}
FILES_STORE = \'images\'
CONCURRENT_REQUESTS_PER_DOMAIN = 1
DOWNLOAD_DELAY = 5
pttbeauty.json其中一段{"author": "tactics2100 (Ose)","title": "[正妹] 冈田さりお 开球","date": "2024-12-07 15:34:37","score": 46,"pushes": 53,"comments": 90,"url": "https://www.ptt.cc/bbs/Beauty/M.1733556879.A.F3E.html","images": 2,"file_urls": ["http://i.imgur.com/SzEH8BS.jpg","http://i.imgur.com/KTiZOUy.jpg"]}
且在专案底下,也有自动产生images 资料夹但就是没有图片
不知道是否真这些EXCEPTION有关
--- <exception caught here> ---
C:\\ProgramData\\anaconda3\\Lib\\site-packages\\twisted\\internet\\defer.py:1075:_runCallbacks
C:\\ProgramData\\anaconda3\\Lib\\site-packages\\scrapy\\pipelines\\files.py:459:media_failed
]
NoneType: None
2024-12-09 22:39:02 [scrapy.pipelines.media] ERROR: [Failure instance: Traceback: <class \'scrapy.pipelines.files.FileException\'>:
C:\\ProgramData\\anaconda3\\Lib\\site-packages\\twisted\\internet\\defer.py:533:addCallbacks
C:\\ProgramData\\anaconda3\\Lib\\site-packages\\twisted\\internet\\defer.py:1075:_runCallbacks
C:\\ProgramData\\anaconda3\\Lib\\site-packages\\scrapy\\pipelines\\media.py:197:_check_media_to_download
C:\\ProgramData\\anaconda3\\Lib\\site-packages\\twisted\\internet\\defer.py:533:addCallbacks
1 个回答
1
hokou
iT邦好手 1 级 ‧ 2024-12-10 08:24:00
可以看看 "C:\\images"
有没有
或是把 FILES_STORE = \'images\'
里面的路径改成相对或绝对路径看看
"./images"
"D:\\images"
应该是没找到资料夹
-
4 -
-
看更多先前的...收起先前的...
noway
iT邦研究生 1 级 ‧
2024-12-11 18:17:28
您好:这之前都是过,就是没图片下载
且都有产生images资料夹,但里面就是没有档案
#FILES_STORE = \'./images\'
FILES_STORE = \'D:\\images\'
#FILES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)),\'images\')
修改
hokou
iT邦好手 1 级 ‧
2024-12-11 21:21:19
allowed_domains = ["ptt.cc"]
加入 i.imgur.com 看看
或是不要设
修改
noway
iT邦研究生 1 级 ‧
2024-12-15 18:26:29
您好:
试过
allowed_domains = ["ptt.cc"]
allowed_domains = ["ptt.cc","i.imgur.com"]
allowed_domains = []
但 都有建立images,但没有图片下载
修改
hokou
iT邦好手 1 级 ‧
2024-12-16 08:26:45
FILES_STORE = \'path/to/your/files\'
MEDIA_ALLOW_REDIRECTS = True # 若下载链接有重定向
另外我实际试
http://i.imgur.com/SzEH8BS.jpg
点了网址会变成
https://imgur.com/SzEH8BS
从这去看图片网址是 .jpeg
也许是格式差异
https://i.imgur.com/SzEH8BS.jpeg
修改