# 项目说明
- 轻量级爬虫框架,支持中间件、校验等功能,用法与 Scrapy、Feapder 类似。
# Python解释器
- python3
# 如何使用 pader?
#### 使用SlowSpider
- 单线程爬虫
```python
from loguru import logger
import pader
class TestSpider(pader.SlowSpider):
    """Example spider built on ``pader.SlowSpider``, seeded with the Baidu home page.

    ``parse`` logs each hot-search entry and yields a follow-up request for
    it; ``parse_detail`` logs the headings found on each followed page.
    """

    start_urls = ['https://www.baidu.com']

    def when_spider_start(self):
        """Print the start banner."""
        # NOTE(review): presumably invoked by the framework when the crawl
        # begins -- confirm against pader.
        print('爬虫开始了...')

    def when_spider_close(self):
        """Print the closing banner."""
        # NOTE(review): presumably invoked by the framework when the crawl
        # ends -- confirm against pader.
        print('...爬虫结束了')

    def parse(self, request, response):
        """Log every hot-search entry and yield a request for its link.

        Each yielded request is routed to ``parse_detail``.
        """
        lis = response.xpath('//ul[@id="hotsearch-content-wrapper"]/li')
        for li in lis:
            url = li.xpath('./a/@href').get()
            title = li.xpath('./a/span[last()]/text()').get()
            logger.success(title)
            logger.success(url)
            logger.info('\r')
            yield pader.Request(url, callback=self.parse_detail)

    def parse_detail(self, request, response):
        """Log the title and link of every ``<h3>`` inside a result container."""
        nodes = response.xpath('//div[@class="c-container"]//h3')
        for node in nodes:
            # The heading text may be split across nested nodes, so join
            # all text fragments under the anchor.
            some = node.xpath('./a//text()').getall()
            title = ''.join(some)
            url = node.xpath('./a/@href').get()
            logger.success(title)
            logger.success(url)

    def middleware(self, request):
        """Set ``request.mark`` according to the request's callback and log it."""
        request.mark = '百度首页' if request.callback.__name__ == 'parse' else '百度搜索页'
        logger.info('进入了中间件,已设置记号为{}'.format(request.mark))

    def validate(self, request, response):
        """Log the request's ``mark`` attribute at warning level."""
        logger.warning('进入了校验,记号={}'.format(request.mark))
# Run the example only when this file is executed as a script.
if __name__ == '__main__':
    TestSpider().crawl()
```
#### 使用PaderSpider
- 多线程爬虫
```python
import threading
import time
from loguru import logger
import pader
def t_name():
    """Return the name of the thread that is executing this call."""
    return threading.current_thread().name
def show(request):
    """Log the name of *request*'s callback together with the current thread name."""
    logger.success("回调: {} => 线程: {}".format(request.callback.__name__, t_name()))
# Page address passed to every pader.Request created in this example.
URL = "https://www.baidu.com/s?&wd=python3"
class TestSpider(pader.PaderSpider):
    """Example spider built on ``pader.PaderSpider``.

    Requests fan out through three callbacks (``parse`` -> ``parse_list`` ->
    ``parse_detail``); each callback logs its own name and the thread it is
    running on.
    """

    def start_requests(self):
        """Yield five identical seed requests for ``URL``."""
        # NOTE(review): no callback is given here; presumably the framework
        # defaults to ``parse`` -- confirm against pader.
        for _ in range(5):
            yield pader.Request(URL)

    def when_spider_start(self):
        """Log the start banner."""
        logger.info('爬虫开始了...')

    def when_spider_close(self):
        """Log the closing banner."""
        logger.info('...爬虫结束了')

    def parse(self, request, response):
        """Log the call, then yield two marked requests handled by ``parse_list``."""
        show(request)
        for i in range(2):
            mark = 'parse-{}'.format(i + 1)
            yield pader.Request(URL, mark=mark, callback=self.parse_list)

    def parse_list(self, request, response):
        """Log the call, then yield three marked requests handled by ``parse_detail``."""
        show(request)
        for i in range(3):
            mark = 'parse_list-{}'.format(i + 1)
            yield pader.Request(URL, mark=mark, callback=self.parse_detail)

    def parse_detail(self, request, response):
        """Log the call; this callback yields no further requests."""
        show(request)

    def middleware(self, request):
        """Pause for one second."""
        time.sleep(1)  # Sleep 1s so the concurrency is easy to observe.
# Run the example only when this file is executed as a script.
if __name__ == '__main__':
    TestSpider(speed=5, qsize=10).crawl()
```
Raw data
{
"_id": null,
"home_page": "https://github.com/markadc/sqlman",
"name": "pader",
"maintainer": null,
"docs_url": null,
"requires_python": null,
"maintainer_email": null,
"keywords": "Python, Spider",
"author": "WangTuo",
"author_email": "markadc@126.com",
"download_url": "https://files.pythonhosted.org/packages/25/08/e143b1167a1dedd432cd19df25716fb9ab93082f529252878ed742affb7e/pader-0.3.tar.gz",
"platform": null,
"description": "# \u9879\u76ee\u8bf4\u660e\n\n- \u8f7b\u91cf\u6846\u67b6\uff0c\u652f\u6301\u4e2d\u95f4\u4ef6\u3001\u68c0\u9a8c\u7b49\u529f\u80fd\u3002\u7528\u6cd5\u4e0eScrapy\u3001Feapder\u7c7b\u4f3c\u3002\n\n# Python\u89e3\u91ca\u5668\n\n- python3\n\n# \u5982\u4f55\u4f7f\u7528pader\uff1f\n\n#### \u4f7f\u7528SlowSpider\n\n- \u5355\u7ebf\u7a0b\u722c\u866b\n\n```python\nfrom loguru import logger\n\nimport pader\n\n\nclass TestSpider(pader.SlowSpider):\n start_urls = ['https://www.baidu.com']\n\n def when_spider_start(self):\n print('\u722c\u866b\u5f00\u59cb\u4e86...')\n\n def when_spider_close(self):\n print('...\u722c\u866b\u7ed3\u675f\u4e86')\n\n def parse(self, request, response):\n lis = response.xpath('//ul[@id=\"hotsearch-content-wrapper\"]/li')\n for li in lis:\n url = li.xpath('./a/@href').get()\n title = li.xpath('./a/span[last()]/text()').get()\n logger.success(title)\n logger.success(url)\n logger.info('\\r')\n yield pader.Request(url, callback=self.parse_detail)\n\n def parse_detail(self, request, response):\n nodes = response.xpath('//div[@class=\"c-container\"]//h3')\n for node in nodes:\n some = node.xpath('./a//text()').getall()\n title = ''.join(some)\n url = node.xpath('./a/@href').get()\n logger.success(title)\n logger.success(url)\n\n def middleware(self, request):\n request.mark = '\u767e\u5ea6\u9996\u9875' if request.callback.__name__ == 'parse' else '\u767e\u5ea6\u641c\u7d22\u9875'\n logger.info('\u8fdb\u5165\u4e86\u4e2d\u95f4\u4ef6\uff0c\u5df2\u8bbe\u7f6e\u8bb0\u53f7\u4e3a{}'.format(request.mark))\n\n def validate(self, request, response):\n logger.warning('\u8fdb\u5165\u4e86\u6821\u9a8c\uff0c\u8bb0\u53f7={}'.format(request.mark))\n\n\nif __name__ == '__main__':\n TestSpider().crawl()\n\n```\n\n#### \u4f7f\u7528PaderSpider\n\n- \u591a\u7ebf\u7a0b\u722c\u866b\n\n```python\nimport threading\nimport time\n\nfrom loguru import logger\n\nimport pader\n\n\ndef t_name():\n return threading.current_thread().name\n\n\ndef show(request):\n 
logger.success(\"\u56de\u8c03: {} => \u7ebf\u7a0b: {}\".format(request.callback.__name__, t_name()))\n\n\nURL = \"https://www.baidu.com/s?&wd=python3\"\n\n\nclass TestSpider(pader.PaderSpider):\n def start_requests(self):\n for i in range(5):\n yield pader.Request(URL)\n\n def when_spider_start(self):\n logger.info('\u722c\u866b\u5f00\u59cb\u4e86...')\n\n def when_spider_close(self):\n logger.info('...\u722c\u866b\u7ed3\u675f\u4e86')\n\n def parse(self, request, response):\n show(request)\n for i in range(2):\n mark = 'parse-{}'.format(i + 1)\n yield pader.Request(URL, mark=mark, callback=self.parse_list)\n\n def parse_list(self, request, response):\n show(request)\n for i in range(3):\n mark = 'parse_list-{}'.format(i + 1)\n yield pader.Request(URL, mark=mark, callback=self.parse_detail)\n\n def parse_detail(self, request, response):\n show(request)\n\n def middleware(self, request):\n time.sleep(1) # \u7761\u77201S\u65b9\u4fbf\u770b\u51fa\u5e76\u53d1\u6548\u679c\n\n\nif __name__ == '__main__':\n TestSpider(speed=5, qsize=10).crawl()\n\n```\n",
"bugtrack_url": null,
"license": "MIT",
"summary": "\u8f7b\u91cf\u7ea7\u7684\u722c\u866b\u6846\u67b6\uff0c\u652f\u6301\u4e2d\u95f4\u4ef6\u3001\u68c0\u9a8c\u7b49\u529f\u80fd",
"version": "0.3",
"project_urls": {
"Homepage": "https://github.com/markadc/sqlman"
},
"split_keywords": [
"python",
" spider"
],
"urls": [
{
"comment_text": "",
"digests": {
"blake2b_256": "2508e143b1167a1dedd432cd19df25716fb9ab93082f529252878ed742affb7e",
"md5": "6f8badc00648765a932a1b7ad682f292",
"sha256": "9b5b957867a3203f7f46e074190a4c00177acc16798bed8ca06dc612c46abc7c"
},
"downloads": -1,
"filename": "pader-0.3.tar.gz",
"has_sig": false,
"md5_digest": "6f8badc00648765a932a1b7ad682f292",
"packagetype": "sdist",
"python_version": "source",
"requires_python": null,
"size": 6439,
"upload_time": "2024-05-07T16:06:04",
"upload_time_iso_8601": "2024-05-07T16:06:04.995629Z",
"url": "https://files.pythonhosted.org/packages/25/08/e143b1167a1dedd432cd19df25716fb9ab93082f529252878ed742affb7e/pader-0.3.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2024-05-07 16:06:04",
"github": true,
"gitlab": false,
"bitbucket": false,
"codeberg": false,
"github_user": "markadc",
"github_project": "sqlman",
"github_not_found": true,
"lcname": "pader"
}