pader


Name: pader JSON
Version: 0.3 PyPI version JSON
download
home_page: https://github.com/markadc/sqlman
Summary: 轻量级的爬虫框架，支持中间件、检验等功能
upload_time: 2024-05-07 16:06:04
maintainer: None
docs_url: None
author: WangTuo
requires_python: None
license: MIT
keywords: python spider
VCS
bugtrack_url
requirements No requirements were recorded.
Travis-CI No Travis.
coveralls test coverage No coveralls.
# 项目说明

- 轻量框架，支持中间件、检验等功能。用法与Scrapy、Feapder类似。

# Python解释器

- python3

# 如何使用pader？

#### 使用SlowSpider

- 单线程爬虫

```python
from loguru import logger

import pader


class TestSpider(pader.SlowSpider):
    """Single-threaded example spider built on ``pader.SlowSpider``.

    Starts from the Baidu home page, logs every hot-search entry found
    there, and follows each entry to log the headings of the linked page.
    """

    start_urls = ['https://www.baidu.com']

    def when_spider_start(self):
        """Print a start banner (presumably invoked by pader when the crawl begins)."""
        print('爬虫开始了...')

    def when_spider_close(self):
        """Print an end banner (presumably invoked by pader when the crawl ends)."""
        print('...爬虫结束了')

    def parse(self, request, response):
        """Log each hot-search item and yield a follow-up request for its link."""
        for item in response.xpath('//ul[@id="hotsearch-content-wrapper"]/li'):
            link = item.xpath('./a/@href').get()
            heading = item.xpath('./a/span[last()]/text()').get()
            logger.success(heading)
            logger.success(link)
            # A lone carriage return is logged as a visual separator between items.
            logger.info('\r')
            yield pader.Request(link, callback=self.parse_detail)

    def parse_detail(self, request, response):
        """Log the text and link of every ``h3`` found inside a ``c-container`` div."""
        for heading in response.xpath('//div[@class="c-container"]//h3'):
            # The anchor text may be split over several text nodes; join them.
            text = ''.join(heading.xpath('./a//text()').getall())
            link = heading.xpath('./a/@href').get()
            logger.success(text)
            logger.success(link)

    def middleware(self, request):
        """Tag the request with a mark chosen from its callback's name, then log it."""
        if request.callback.__name__ == 'parse':
            request.mark = '百度首页'
        else:
            request.mark = '百度搜索页'
        logger.info('进入了中间件,已设置记号为{}'.format(request.mark))

    def validate(self, request, response):
        """Log the mark carried by the request."""
        logger.warning('进入了校验,记号={}'.format(request.mark))


if __name__ == '__main__':
    # Run the example only when this file is executed as a script.
    spider = TestSpider()
    spider.crawl()

```

#### 使用PaderSpider

- 多线程爬虫

```python
import threading
import time

from loguru import logger

import pader


def t_name():
    """Return the name of the thread that is executing the caller."""
    current = threading.current_thread()
    return current.name


def show(request):
    """Log the name of the request's callback together with the current thread name."""
    callback_name = request.callback.__name__
    thread_name = t_name()
    logger.success("回调: {}  =>  线程: {}".format(callback_name, thread_name))


# Baidu search URL (query "python3") used for every request in this example.
URL = "https://www.baidu.com/s?&wd=python3"


class TestSpider(pader.PaderSpider):
    """Multi-threaded example spider built on ``pader.PaderSpider``.

    Every request targets the same ``URL``. Each callback logs which thread
    it runs on via ``show`` and, except for the last stage, yields further
    requests so the concurrent scheduling becomes visible in the log.
    """

    def start_requests(self):
        """Yield five seed requests for ``URL``."""
        for _ in range(5):
            yield pader.Request(URL)

    def when_spider_start(self):
        """Log a start banner (presumably invoked by pader when the crawl begins)."""
        logger.info('爬虫开始了...')

    def when_spider_close(self):
        """Log an end banner (presumably invoked by pader when the crawl ends)."""
        logger.info('...爬虫结束了')

    def parse(self, request, response):
        """Log the current thread, then yield two requests handled by ``parse_list``."""
        show(request)
        for seq in range(1, 3):
            label = 'parse-{}'.format(seq)
            yield pader.Request(URL, mark=label, callback=self.parse_list)

    def parse_list(self, request, response):
        """Log the current thread, then yield three requests handled by ``parse_detail``."""
        show(request)
        for seq in range(1, 4):
            label = 'parse_list-{}'.format(seq)
            yield pader.Request(URL, mark=label, callback=self.parse_detail)

    def parse_detail(self, request, response):
        """Final stage: only log the current thread."""
        show(request)

    def middleware(self, request):
        """Delay each request so the concurrency is easy to observe."""
        time.sleep(1)  # sleep 1 s to make the concurrent behaviour visible


if __name__ == '__main__':
    # Run the example only when this file is executed as a script.
    # NOTE(review): speed/qsize presumably set worker count and queue size — confirm in pader.
    spider = TestSpider(speed=5, qsize=10)
    spider.crawl()

```

            

Raw data

            {
    "_id": null,
    "home_page": "https://github.com/markadc/sqlman",
    "name": "pader",
    "maintainer": null,
    "docs_url": null,
    "requires_python": null,
    "maintainer_email": null,
    "keywords": "Python, Spider",
    "author": "WangTuo",
    "author_email": "markadc@126.com",
    "download_url": "https://files.pythonhosted.org/packages/25/08/e143b1167a1dedd432cd19df25716fb9ab93082f529252878ed742affb7e/pader-0.3.tar.gz",
    "platform": null,
    "description": "# \u9879\u76ee\u8bf4\u660e\n\n- \u8f7b\u91cf\u6846\u67b6\uff0c\u652f\u6301\u4e2d\u95f4\u4ef6\u3001\u68c0\u9a8c\u7b49\u529f\u80fd\u3002\u7528\u6cd5\u4e0eScrapy\u3001Feapder\u7c7b\u4f3c\u3002\n\n# Python\u89e3\u91ca\u5668\n\n- python3\n\n# \u5982\u4f55\u4f7f\u7528pader\uff1f\n\n#### \u4f7f\u7528SlowSpider\n\n- \u5355\u7ebf\u7a0b\u722c\u866b\n\n```python\nfrom loguru import logger\n\nimport pader\n\n\nclass TestSpider(pader.SlowSpider):\n    start_urls = ['https://www.baidu.com']\n\n    def when_spider_start(self):\n        print('\u722c\u866b\u5f00\u59cb\u4e86...')\n\n    def when_spider_close(self):\n        print('...\u722c\u866b\u7ed3\u675f\u4e86')\n\n    def parse(self, request, response):\n        lis = response.xpath('//ul[@id=\"hotsearch-content-wrapper\"]/li')\n        for li in lis:\n            url = li.xpath('./a/@href').get()\n            title = li.xpath('./a/span[last()]/text()').get()\n            logger.success(title)\n            logger.success(url)\n            logger.info('\\r')\n            yield pader.Request(url, callback=self.parse_detail)\n\n    def parse_detail(self, request, response):\n        nodes = response.xpath('//div[@class=\"c-container\"]//h3')\n        for node in nodes:\n            some = node.xpath('./a//text()').getall()\n            title = ''.join(some)\n            url = node.xpath('./a/@href').get()\n            logger.success(title)\n            logger.success(url)\n\n    def middleware(self, request):\n        request.mark = '\u767e\u5ea6\u9996\u9875' if request.callback.__name__ == 'parse' else '\u767e\u5ea6\u641c\u7d22\u9875'\n        logger.info('\u8fdb\u5165\u4e86\u4e2d\u95f4\u4ef6\uff0c\u5df2\u8bbe\u7f6e\u8bb0\u53f7\u4e3a{}'.format(request.mark))\n\n    def validate(self, request, response):\n        logger.warning('\u8fdb\u5165\u4e86\u6821\u9a8c\uff0c\u8bb0\u53f7={}'.format(request.mark))\n\n\nif __name__ == '__main__':\n    TestSpider().crawl()\n\n```\n\n#### \u4f7f\u7528PaderSpider\n\n- 
\u591a\u7ebf\u7a0b\u722c\u866b\n\n```python\nimport threading\nimport time\n\nfrom loguru import logger\n\nimport pader\n\n\ndef t_name():\n    return threading.current_thread().name\n\n\ndef show(request):\n    logger.success(\"\u56de\u8c03: {}  =>  \u7ebf\u7a0b: {}\".format(request.callback.__name__, t_name()))\n\n\nURL = \"https://www.baidu.com/s?&wd=python3\"\n\n\nclass TestSpider(pader.PaderSpider):\n    def start_requests(self):\n        for i in range(5):\n            yield pader.Request(URL)\n\n    def when_spider_start(self):\n        logger.info('\u722c\u866b\u5f00\u59cb\u4e86...')\n\n    def when_spider_close(self):\n        logger.info('...\u722c\u866b\u7ed3\u675f\u4e86')\n\n    def parse(self, request, response):\n        show(request)\n        for i in range(2):\n            mark = 'parse-{}'.format(i + 1)\n            yield pader.Request(URL, mark=mark, callback=self.parse_list)\n\n    def parse_list(self, request, response):\n        show(request)\n        for i in range(3):\n            mark = 'parse_list-{}'.format(i + 1)\n            yield pader.Request(URL, mark=mark, callback=self.parse_detail)\n\n    def parse_detail(self, request, response):\n        show(request)\n\n    def middleware(self, request):\n        time.sleep(1)  # \u7761\u77201S\u65b9\u4fbf\u770b\u51fa\u5e76\u53d1\u6548\u679c\n\n\nif __name__ == '__main__':\n    TestSpider(speed=5, qsize=10).crawl()\n\n```\n",
    "bugtrack_url": null,
    "license": "MIT",
    "summary": "\u8f7b\u91cf\u7ea7\u7684\u722c\u866b\u6846\u67b6\uff0c\u652f\u6301\u4e2d\u95f4\u4ef6\u3001\u68c0\u9a8c\u7b49\u529f\u80fd",
    "version": "0.3",
    "project_urls": {
        "Homepage": "https://github.com/markadc/sqlman"
    },
    "split_keywords": [
        "python",
        " spider"
    ],
    "urls": [
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "2508e143b1167a1dedd432cd19df25716fb9ab93082f529252878ed742affb7e",
                "md5": "6f8badc00648765a932a1b7ad682f292",
                "sha256": "9b5b957867a3203f7f46e074190a4c00177acc16798bed8ca06dc612c46abc7c"
            },
            "downloads": -1,
            "filename": "pader-0.3.tar.gz",
            "has_sig": false,
            "md5_digest": "6f8badc00648765a932a1b7ad682f292",
            "packagetype": "sdist",
            "python_version": "source",
            "requires_python": null,
            "size": 6439,
            "upload_time": "2024-05-07T16:06:04",
            "upload_time_iso_8601": "2024-05-07T16:06:04.995629Z",
            "url": "https://files.pythonhosted.org/packages/25/08/e143b1167a1dedd432cd19df25716fb9ab93082f529252878ed742affb7e/pader-0.3.tar.gz",
            "yanked": false,
            "yanked_reason": null
        }
    ],
    "upload_time": "2024-05-07 16:06:04",
    "github": true,
    "gitlab": false,
    "bitbucket": false,
    "codeberg": false,
    "github_user": "markadc",
    "github_project": "sqlman",
    "github_not_found": true,
    "lcname": "pader"
}
        
Elapsed time: 0.23400s