# ReSpider
## 开始一个爬虫
> 爬虫继承自 `ReSpider.Spider` 类
```
import ReSpider
class TestSpider(ReSpider.Spider):
# 自定义配置
__custom_setting__ = {}
start_urls = []
def start_requests(self):
pass
def parse(self, response):
pass
if __name__ == '__main__':
TestSpider().start()
```
## 通过命令创建
```
respider create -p project_name # 创建项目
respider create -s spider_name # 创建爬虫程序, 需要符合命名规范
```
## 自定义设置
```
__custom_setting__ = {
'TASK_LIMIT': 1, # 设置并发数, 默认为1
'SCHEDULER': 'ReSpider.extend.redis.scheduler.RedisScheduler', # 设置任务队列, 默认为内存
'DOWNLOAD_DELAY': 1, # 下载延迟, 默认为0
'RETRY_ENABLED': True, # 重试, 设置为True开启重试, 默认关闭
'SSL_FINGERPRINT': False # ssl指纹, 默认关闭, 在创建ssl上下文时会阻塞, 开启后并发会降为1
}
```
## 中间件设置
```
# 管道
ITEM_PIPELINES = {
'ReSpider.pipelines.files.CSVPipeline': 4,
'ReSpider.pipelines.redis.RedisPipeline': 5,
'ReSpider.pipelines.files.FilesPipeline': 6,
'ReSpider.pipelines.mongodb.MongoDBPipeline': 8
}
# 下载中间件
DOWNLOADER_MIDDLEWARES = {
'ReSpider.middlewares.useragent.UserAgentMiddleware': 2,
# 'ReSpider.extend.puppeteer.downloadmiddleware.PuppeteerMiddleware': 5,
'ReSpider.middlewares.retry.RetryMiddleware': 8
}
```
## 信号 (待开发)
### 添加信号参数
- 解决任务完成时偶现程序无法正常停止的问题(浏览器渲染下浏览器无法关闭)
- 中间件和管道增加关闭标志(is_closed)
- 根据传递的信号参数对是否关闭标志进行赋值,根据标志来开关中间件或管道
## 保存数据
### Item
一般的数据实体
```
from ReSpider import item
data = item.DataItem({'name': 'ReSpider'}, **kwargs)
```
### xxListItem
数据实体列表, 可以传入一个list来构造
```
from ReSpider import item
data_list_item = item.DataListItem([1, 2, 3], **kwargs)
```
### 保存数据
```
# 二进制数据
io_item = item.IoItem(b'hello world', filename='hello world', filetype='bin')
# 文件类型, filetype 为文件类型
file_item = item.FileItem('hello world', filename='hello world', filetype='text')
# 表格数据
csv_item = item.CSVItem({'name': '张三', 'age': 14}, filename='hello world')
# 多行使用list
csv_list = item.CSVListItem([{'name': '张三', 'age': 14}, {'name': '李四', 'age': 19}], filename='法外狂徒')
# 使用yield来保存数据
data = item.DataItem()
yield data
```
## Log
### 日志设置
```
__custom_setting__ = {
'LOG_PATH': None, # log文件写入位置
'LOG_TO_CONSOLE': True, # 输出日志到控制台, 默认开启
'LOG_LEVEL_CONSOLE': 'DEBUG', # 输出日志到控制台级别, 默认DEBUG
'LOG_TO_FILE': False, # 输出日志到文件, 默认关闭, 设置为True开启
'LOG_LEVEL_FILE': 'WARNING' # 输出日志到文件级别, 默认WARNING
}
```
## JS渲染
### 无头模式下cookie问题
Raw data
{
"_id": null,
"home_page": "https://github.com/zaoxg/ReSpiderFramework",
"name": "ReSpider",
"maintainer": "",
"docs_url": null,
"requires_python": ">=3.7",
"maintainer_email": "",
"keywords": "respider",
"author": "zhaoxiangpeng",
"author_email": "zhaoxiangpengSR@gmail.com",
"download_url": "",
"platform": null,
"description": "# ReSpider\r\n\r\n## \u5f00\u59cb\u4e00\u4e2a\u722c\u866b\r\n> \u722c\u866b\u7ee7\u627f\u81ea <ReSpider.Spider> \u7c7b\r\n```\r\nimport ReSpider\r\n\r\n\r\nclass TestSpider(ReSpider.Spider):\r\n # \u81ea\u5b9a\u4e49\u914d\u7f6e\r\n __custom_setting__ = {}\r\n start_urls = []\r\n\r\n def start_requests(self):\r\n pass\r\n\r\n def parse(self, response):\r\n pass\r\n\r\n\r\nif __name__ == '__main__':\r\n TestSpider().start() \r\n```\r\n\r\n## \u901a\u8fc7\u547d\u4ee4\u521b\u5efa\r\n```\r\nrespider create -p project_name # \u521b\u5efa\u9879\u76ee\r\nrespider create -s spider_name # \u521b\u5efa\u722c\u866b\u7a0b\u5e8f, \u9700\u8981\u7b26\u5408\u547d\u540d\u89c4\u8303\r\n```\r\n\r\n## \u81ea\u5b9a\u4e49\u8bbe\u7f6e\r\n```\r\n__custom_setting__ = {\r\n 'TASK_LIMIT': 1, # \u8bbe\u7f6e\u5e76\u53d1\u6570, \u9ed8\u8ba4\u4e3a1\r\n 'SCHEDULER': 'ReSpider.extend.redis.scheduler.RedisScheduler', # \u8bbe\u7f6e\u4efb\u52a1\u961f\u5217, \u9ed8\u8ba4\u4e3a\u5185\u5b58\r\n 'DOWNLOAD_DELAY': 1, # \u4e0b\u8f7d\u5ef6\u8fdf, \u9ed8\u8ba4\u4e3a0\r\n 'RETRY_ENABLED': True, # \u91cd\u8bd5, \u8bbe\u7f6e\u4e3aTrue\u5f00\u542f\u91cd\u8bd5, \u9ed8\u8ba4\u5173\u95ed\r\n 'SSL_FINGERPRINT': False # ssl\u6307\u7eb9, \u9ed8\u8ba4\u5173\u95ed, \u5728\u521b\u5efassl\u4e0a\u4e0b\u6587\u65f6\u4f1a\u963b\u585e, \u5f00\u542f\u540e\u5e76\u53d1\u4f1a\u964d\u4e3a1 \r\n}\r\n```\r\n\r\n## \u4e2d\u95f4\u4ef6\u8bbe\u7f6e\r\n```\r\n# \u7ba1\u9053\r\nITEM_PIPELINES = {\r\n 'ReSpider.pipelines.files.CSVPipeline': 4,\r\n 'ReSpider.pipelines.redis.RedisPipeline': 5,\r\n 'ReSpider.pipelines.files.FilesPipeline': 6,\r\n 'ReSpider.pipelines.mongodb.MongoDBPipeline': 8\r\n}\r\n\r\n# \u4e0b\u8f7d\u4e2d\u95f4\u4ef6\r\nDOWNLOADER_MIDDLEWARES = {\r\n 'ReSpider.middlewares.useragent.UserAgentMiddleware': 2,\r\n # 'ReSpider.extend.puppeteer.downloadmiddleware.PuppeteerMiddleware': 5,\r\n 'ReSpider.middlewares.retry.RetryMiddleware': 8\r\n}\r\n```\r\n\r\n## \u4fe1\u53f7 (\u5f85\u5f00\u53d1)\r\n### 
\u6dfb\u52a0\u4fe1\u53f7\u53c2\u6570\r\n- \u89e3\u51b3\u4efb\u52a1\u5b8c\u6210\u65f6\u5076\u5148\u7a0b\u5e8f\u65e0\u6cd5\u6b63\u5e38\u505c\u6b62\u7684\u95ee\u9898(\u6d4f\u89c8\u5668\u6e32\u67d3\u4e0b\u6d4f\u89c8\u5668\u65e0\u6cd5\u5173\u95ed)\r\n- \u4e2d\u95f4\u4ef6\u548c\u7ba1\u9053\u589e\u52a0\u5173\u95ed\u6807\u5fd7(is_closed)\r\n- \u6839\u636e\u4f20\u9012\u7684\u4fe1\u53f7\u53c2\u6570\u5bf9\u662f\u5426\u5173\u95ed\u6807\u5fd7\u8fdb\u884c\u8d4b\u503c\uff0c\u6839\u636e\u6807\u5fd7\u6765\u5f00\u5173\u4e2d\u95f4\u4ef6\u6216\u7ba1\u9053\r\n \r\n## \u4fdd\u5b58\u6570\u636e\r\n### Item\r\n\u4e00\u822c\u7684\u6570\u636e\u5b9e\u4f53\r\n```\r\nfrom ReSpider import item\r\ndata = DataItem({'name': 'ReSpider'}, **kwargs)\r\n```\r\n### xxListItem\r\n\u6570\u636e\u5b9e\u4f53\u5217\u8868, \u53ef\u4ee5\u4f20\u5165\u4e00\u4e2alist\u6765\u6784\u9020\r\n```\r\nfrom ReSpider import item\r\ndata_list_item = DataListItem([1, 2, 3], **kwargs)\r\n```\r\n### \u4fdd\u5b58\u6570\u636e\r\n```\r\n# \u4e8c\u8fdb\u5236\u6570\u636e\r\nio_item = item.IoItem(b'hello world', filename='hello world', filetype='bin')\r\n\r\n# \u6587\u4ef6\u7c7b\u578b, filetype \u4e3a\u6587\u4ef6\u7c7b\u578b\r\nfile_item = item.FileItem('hello world', filename='hello world', filetype='text')\r\n\r\n# \u8868\u683c\u6570\u636e\r\ncsv_item = item.CSVItem({'name': '\u5f20\u4e09', 'age': 14}, filename='hello world')\r\n\r\n# \u591a\u884c\u4f7f\u7528list\r\ncsv_list = item.CSVListItem([{'name': '\u5f20\u4e09', 'age': 14}, {'name': '\u674e\u56db', 'age': 19}], filename='\u6cd5\u5916\u72c2\u5f92')\r\n\r\n# \u4f7f\u7528yield\u6765\u4fdd\u5b58\u6570\u636e\r\ndata = item.DataItem()\r\nyield data\r\n```\r\n\r\n## Log\r\n### \u65e5\u5fd7\u8bbe\u7f6e\r\n```\r\n__custom_setting__ = {\r\n 'LOG_PATH': None, # log\u6587\u4ef6\u5199\u5165\u4f4d\u7f6e\r\n 'LOG_TO_CONSOLE': True, # \u8f93\u51fa\u65e5\u5fd7\u5230\u63a7\u5236\u53f0, \u9ed8\u8ba4\u5f00\u542f\r\n 'LOG_LEVEL_CONSOLE': 'DEBUG', # 
\u8f93\u51fa\u65e5\u5fd7\u5230\u63a7\u5236\u53f0\u7ea7\u522b, \u9ed8\u8ba4DEBUG\r\n 'LOG_TO_FILE': False, # \u8f93\u51fa\u65e5\u5fd7\u5230\u6587\u4ef6, \u9ed8\u8ba4\u5173\u95ed, \u8bbe\u7f6e\u4e3aTrue\u5f00\u542f\r\n 'LOG_LEVEL_FILE': 'WARNING' # \u8f93\u51fa\u65e5\u5fd7\u5230\u6587\u4ef6\u7ea7\u522b, \u9ed8\u8ba4WARNING\r\n}\r\n```\r\n\r\n\r\n## JS\u6e32\u67d3\r\n### \u65e0\u5934\u6a21\u5f0f\u4e0bcookie\u95ee\u9898\r\n",
"bugtrack_url": null,
"license": "",
"summary": "ReSpider\u662f\u4e00\u6b3e\u57fa\u4e8e aiohttp \u8bf7\u6c42\u5e93\u7684python\u722c\u866b\u7a0b\u5e8f",
"version": "1.0.4",
"split_keywords": [
"respider"
],
"urls": [
{
"comment_text": "",
"digests": {
"blake2b_256": "0b16248b98ed4ed2c55e368a3ec31ef7c16eacd4f83ecb595a89b365cd1504b2",
"md5": "f0b57783b98b53e664516712afed6b76",
"sha256": "a320f9a86a3f4322dd7ce4c8f07fbdde94d51a9b354c79310f731fd997b8d9af"
},
"downloads": -1,
"filename": "ReSpider-1.0.4-py3-none-any.whl",
"has_sig": false,
"md5_digest": "f0b57783b98b53e664516712afed6b76",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": ">=3.7",
"size": 89489,
"upload_time": "2023-04-03T05:55:11",
"upload_time_iso_8601": "2023-04-03T05:55:11.827595Z",
"url": "https://files.pythonhosted.org/packages/0b/16/248b98ed4ed2c55e368a3ec31ef7c16eacd4f83ecb595a89b365cd1504b2/ReSpider-1.0.4-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2023-04-03 05:55:11",
"github": true,
"gitlab": false,
"bitbucket": false,
"github_user": "zaoxg",
"github_project": "ReSpiderFramework",
"travis_ci": false,
"coveralls": false,
"github_actions": false,
"lcname": "respider"
}