# Scrapy util
A collection of extensions built on top of Scrapy.
PyPI: [https://pypi.org/project/scrapy-util](https://pypi.org/project/scrapy-util)
GitHub: [https://github.com/mouday/scrapy-util](https://github.com/mouday/scrapy-util)
```bash
pip install six scrapy-util
```
## Enable stats collection
This feature is designed to work together with [spider-admin-pro](https://github.com/mouday/spider-admin-pro).
```python
# URL to which run stats are reported; the extension POSTs JSON data to spider-admin-pro
# Note: the value below is only an example; point it at your real spider-admin-pro instance
# Here we assume spider-admin-pro is running at http://127.0.0.1:5001
STATS_COLLECTION_URL = "http://127.0.0.1:5001/api/statsCollection/addItem"

# Enable the stats collection extensions
EXTENSIONS = {
    # ===========================================
    # Optional: if the collected times are in UTC, swap in the
    # local-time stats extension instead of the built-in CoreStats
    'scrapy.extensions.corestats.CoreStats': None,
    'scrapy_util.extensions.LocaltimeCoreStats': 0,
    # ===========================================

    # Optional: print the total run duration
    'scrapy_util.extensions.ShowDurationExtension': 100,

    # Enable the stats collection extension
    'scrapy_util.extensions.StatsCollectorExtension': 100
}
```
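The extension POSTs the collected stats as JSON to `STATS_COLLECTION_URL`. If you want to inspect what gets submitted without running spider-admin-pro, a throwaway stand-in endpoint can log the payload; the sketch below is purely hypothetical and uses Flask, which is not a dependency of scrapy-util:

```python
# stand_in_server.py -- hypothetical receiver, NOT part of spider-admin-pro
from flask import Flask, jsonify, request

app = Flask(__name__)


# Same path as the STATS_COLLECTION_URL configured above
@app.route("/api/statsCollection/addItem", methods=["POST"])
def add_item():
    # Print whatever JSON the stats collector extension posts
    print(request.get_json())
    return jsonify({"code": 0})


if __name__ == "__main__":
    app.run(port=5001)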
## Script Spider
Runs as a plain script only; its Request never hits the network.
```python
# -*- coding: utf-8 -*-
from scrapy import cmdline

from scrapy_util.spiders import ScriptSpider


class BaiduScriptSpider(ScriptSpider):
    name = 'baidu_script'

    def execute(self):
        print("hi")


if __name__ == '__main__':
    cmdline.execute('scrapy crawl baidu_script'.split())
```
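Because a ScriptSpider only runs its `execute` method, it is handy for housekeeping jobs that you still want to schedule and monitor like a regular crawl. A hypothetical example (the cache directory and retention period are made up for illustration):

```python
# -*- coding: utf-8 -*-
# Hypothetical housekeeping spider: delete temp files older than 7 days
import os
import time

from scrapy_util.spiders import ScriptSpider


class CleanupScriptSpider(ScriptSpider):
    name = 'cleanup_script'

    # Assumed directory; adjust to whatever your project actually caches
    cache_dir = '/tmp/crawler-cache'

    def execute(self):
        cutoff = time.time() - 7 * 24 * 3600
        if not os.path.isdir(self.cache_dir):
            return
        for name in os.listdir(self.cache_dir):
            path = os.path.join(self.cache_dir, name)
            if os.path.isfile(path) and os.path.getmtime(path) < cutoff:
                os.remove(path)
```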
## List spider
ListNextRequestSpider is implemented on top of ListSpider; if you need custom caching, override its methods.
```python
# -*- coding: utf-8 -*-
from scrapy import cmdline

from scrapy_util.spiders import ListNextRequestSpider


class BaiduListSpider(ListNextRequestSpider):
    name = 'list_spider'

    page_key = "list_spider"

    # This method must be implemented
    def get_url(self, page):
        return 'http://127.0.0.1:5000/list?page=' + str(page)

    def parse(self, response):
        print(response.text)

        # Request the next page; start_requests calls this method once automatically
        # If you do not want to keep paging, simply skip this call
        yield self.next_request(response)


if __name__ == '__main__':
    cmdline.execute('scrapy crawl list_spider'.split())
```
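The example expects a list endpoint at `http://127.0.0.1:5000/list?page=...`. To try it locally without a real backend, a minimal test endpoint could look like the following sketch (hypothetical, again using Flask; the three-page limit and item format are arbitrary):

```python
# local_list_server.py -- hypothetical test endpoint for the list spider above
from flask import Flask, jsonify, request

app = Flask(__name__)


@app.route("/list")
def list_page():
    page = int(request.args.get("page", 0))
    # Pretend there are only 3 pages of data
    items = ["item-%s-%s" % (page, i) for i in range(5)] if page < 3 else []
    return jsonify({"page": page, "items": items})


if __name__ == "__main__":
    app.run(port=5000)
```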
## MongoDB pipeline
Usage example:
settings.py
```python
# 1. Set the MongoDB connection URI
MONGO_URI = "mongodb://localhost:27017/"

# 2. Enable the MongoPipeline item pipeline
ITEM_PIPELINES = {
    'scrapy_util.pipelines.MongoPipeline': 100,
}
```
```python
# -*- coding: utf-8 -*-
import scrapy
from scrapy import cmdline

from scrapy_util.items import MongoItem


class BaiduMongoSpider(scrapy.Spider):
    name = 'baidu_mongo'

    start_urls = ['http://baidu.com/']

    # 1. Set the target database and collection
    custom_settings = {
        'MONGO_DATABASE': 'data',
        'MONGO_TABLE': 'table'
    }

    def parse(self, response):
        title = response.css('title::text').extract_first()

        item = {
            'data': {
                'title': title
            }
        }

        # 2. Return a MongoItem
        return MongoItem(item)


if __name__ == '__main__':
    cmdline.execute('scrapy crawl baidu_mongo'.split())
```
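To confirm that the item actually landed in MongoDB, a quick check with pymongo (already a dependency of scrapy-util) might look like this; the database and collection names match the `custom_settings` above:

```python
# check_mongo.py -- inspect documents written by MongoPipeline
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017/")
collection = client["data"]["table"]  # MONGO_DATABASE / MONGO_TABLE from the example

for doc in collection.find().limit(5):
    print(doc)
```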
For finer control, subclass `MongoPipeline` and override its methods.
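A minimal sketch of such a subclass, assuming `MongoPipeline` follows the standard Scrapy item-pipeline interface (`process_item`); the timestamp field added here is purely illustrative:

```python
# pipelines.py -- hypothetical customization of MongoPipeline
import time

from scrapy_util.pipelines import MongoPipeline


class TimestampedMongoPipeline(MongoPipeline):

    def process_item(self, item, spider):
        # Add a crawl timestamp before handing the item to the base pipeline
        item['data']['crawled_at'] = int(time.time())
        return super().process_item(item, spider)
```

Remember to point `ITEM_PIPELINES` at the subclass instead of the original `MongoPipeline`.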
## Utility functions
A helper for running a spider from a script:
```python
import scrapy

from scrapy_util import spider_util


class BaiduSpider(scrapy.Spider):
    name = 'baidu_spider'


if __name__ == '__main__':
    # Equivalent to: cmdline.execute('scrapy crawl baidu_spider'.split())
    spider_util.run_spider(BaiduSpider)
```
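For comparison, the plain-Scrapy way to launch a spider from a script (without scrapy_util) uses `CrawlerProcess`; a rough, self-contained equivalent:

```python
# Plain-Scrapy equivalent of spider_util.run_spider (for comparison)
import scrapy
from scrapy.crawler import CrawlerProcess


class BaiduSpider(scrapy.Spider):
    name = 'baidu_spider'


if __name__ == '__main__':
    process = CrawlerProcess()
    process.crawl(BaiduSpider)
    process.start()
```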