craws


Name: craws
Version 0.1.1 PyPI version JSON
download
Home page: https://github.com/markadc/craws
Summary爬虫者
Upload time: 2024-12-28 15:10:13
maintainerNone
docs_urlNone
authorWangTuo
requires_pythonNone
licenseMIT
keywords python spider
VCS
bugtrack_url
requirements No requirements were recorded.
Travis-CI No Travis.
coveralls test coverage No coveralls.
            # 项目说明

- ...

# 更新历史

- 加入爬虫案例

# 爬虫案例

## 爬取皮肤图片

### 代码

```python

import json
import os
import re
from concurrent.futures import ThreadPoolExecutor

from loguru import logger

from craws import AirSpider


class WZRYSkinSpider(AirSpider):
    """Crawl Honor of Kings (王者荣耀) hero skin images from gtimg.cn.

    Downloads each hero's big-skin JPGs into ./英雄皮肤/<hero name>/,
    skipping files that already exist. Per-image downloads fan out over
    a thread pool since the work is I/O-bound.
    """

    def parse_url(self, url, b=False):
        """Fetch *url* and return its body, or None on any failure.

        Args:
            url: Absolute URL to request.
            b: When True return raw bytes (for images); otherwise text.

        Returns:
            ``bytes`` or ``str`` on success, ``None`` on failure
            (best-effort semantics preserved, but failures are now
            logged instead of silently swallowed by a bare except).
        """
        try:
            # NOTE(review): assumes AirSpider.get returns a requests-like
            # response object — confirm against the craws API.
            res = self.get(url)
            res.encoding = "GBK"
            # Validate with a normal branch instead of `assert`, which is
            # stripped under `python -O` and was only "working" because the
            # bare except swallowed the AssertionError.
            if res.status_code != 200:
                logger.warning(f"{url} -> HTTP {res.status_code}")
                return None
            return res.content if b else res.text
        except Exception as e:
            logger.error(f"{url} request failed: {e}")
            return None

    def download_img(self, img_url, hero_name, hero_img, num):
        """Download one skin image to *hero_img*; no-op if the fetch fails."""
        b_data = self.parse_url(img_url, b=True)
        if b_data is None:
            return
        with open(hero_img, "wb") as f:
            f.write(b_data)
        logger.success(f"{hero_name} 第{num}张皮肤图片 下载完毕")

    def process_hero(self, hero_id, name):
        """Download every skin (indices 1-19) for a single hero."""
        logger.info(f"{hero_id}\t{name}\t处理中...")

        hero_dir = f"./英雄皮肤/{name}"
        # exist_ok=True already makes this race-free; no need for a
        # separate os.path.exists() pre-check.
        os.makedirs(hero_dir, exist_ok=True)

        with ThreadPoolExecutor(max_workers=20) as pool:
            for num in range(1, 20):
                hero_img = f"{hero_dir}/皮肤_{num}.png"
                if os.path.exists(hero_img):
                    logger.warning(f"{hero_img}已下载过,跳过")
                    continue
                img_url = f"https://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/{hero_id}/{hero_id}-bigskin-{num}.jpg"
                pool.submit(self.download_img, img_url, name, hero_img, num)

    def crawl(self):
        """Entry point: fetch the hero id table and process each hero."""
        api_url = "https://game.gtimg.cn/images/yxzj/web201706/js/heroid.js"
        text = self.parse_url(api_url)
        if text is None:
            # Previously this crashed with TypeError inside re.search(None).
            logger.error(f"{api_url} fetch failed, aborting crawl")
            return
        search_result = re.search(r'var module_exports = ({.*?})', text, re.S)
        if search_result is None:
            # Previously this crashed with AttributeError on .group(None).
            logger.error("hero table not found in heroid.js, aborting crawl")
            return
        # The JS object uses single quotes; swap to double quotes so the
        # payload parses as JSON.
        hero_info_str = search_result.group(1).replace("'", '"')
        hero_info_dict = json.loads(hero_info_str)

        with ThreadPoolExecutor(max_workers=10) as pool:
            for hero_id, name in hero_info_dict.items():
                # Renamed from `id` to avoid shadowing the builtin.
                pool.submit(self.process_hero, hero_id, name)


if __name__ == "__main__":
    # Script entry point: build the spider and run the full crawl.
    spider = WZRYSkinSpider()
    spider.crawl()

```

            

Raw data

            {
    "_id": null,
    "home_page": "https://github.com/markadc/craws",
    "name": "craws",
    "maintainer": null,
    "docs_url": null,
    "requires_python": null,
    "maintainer_email": null,
    "keywords": "Python, Spider",
    "author": "WangTuo",
    "author_email": "markadc@126.com",
    "download_url": "https://files.pythonhosted.org/packages/ba/36/c45773710f668de77b35ebd8e1e896b4ee23755b2053ba4126e75cb09432/craws-0.1.1.tar.gz",
    "platform": null,
    "description": "# \u9879\u76ee\u8bf4\u660e\n\n- ...\n\n# \u66f4\u65b0\u5386\u53f2\n\n- \u52a0\u5165\u722c\u866b\u6848\u4f8b\n\n# \u722c\u866b\u6848\u5217\n\n## \u722c\u53d6\u76ae\u80a4\u56fe\u7247\n\n### \u4ee3\u7801\n\n```python\n\nimport json\nimport os\nimport re\nfrom concurrent.futures import ThreadPoolExecutor\n\nfrom loguru import logger\n\nfrom craws import AirSpider\n\n\nclass WZRYSkinSpider(AirSpider):\n    def parse_url(self, url, b=False):\n        try:\n            res = self.get(url)\n            res.encoding = \"GBK\"\n            assert res.status_code == 200, \"Code not is 200\"\n            return res.content if b else res.text\n        except:\n            pass\n\n    def download_img(self, img_url, hero_name, hero_img, num):\n        b_data = self.parse_url(img_url, b=True)\n        if b_data is None:\n            return\n        with open(hero_img, \"wb\") as f:\n            f.write(b_data)\n        logger.success(f\"{hero_name} \u7b2c{num}\u5f20\u76ae\u80a4\u56fe\u7247 \u4e0b\u8f7d\u5b8c\u6bd5\")\n\n    def process_hero(self, hero_id, name):\n        logger.info(f\"{hero_id}\\t{name}\\t\u5904\u7406\u4e2d...\")\n\n        hero_dir = f\"./\u82f1\u96c4\u76ae\u80a4/{name}\"\n        if not os.path.exists(hero_dir):\n            os.makedirs(hero_dir, exist_ok=True)\n\n        with ThreadPoolExecutor(max_workers=20) as pool:\n            for num in range(1, 20):\n                hero_img = f\"{hero_dir}/\u76ae\u80a4_{num}.png\"\n                if os.path.exists(hero_img):\n                    logger.warning(f\"{hero_img}\u5df2\u4e0b\u8f7d\u8fc7\uff0c\u8df3\u8fc7\")\n                    continue\n                img_url = f\"https://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/{hero_id}/{hero_id}-bigskin-{num}.jpg\"\n                pool.submit(self.download_img, img_url, name, hero_img, num)\n\n    def crawl(self):\n        api_url = \"https://game.gtimg.cn/images/yxzj/web201706/js/heroid.js\"\n        text = self.parse_url(api_url)\n     
   search_result = re.search('var module_exports = ({.*?})', text, re.S)\n        hero_info_str = search_result.group(1)\n        hero_info_str = re.sub(\"'\", '\"', hero_info_str)\n        hero_info_dict = json.loads(hero_info_str)\n\n        with ThreadPoolExecutor(max_workers=10) as pool:\n            for hero in hero_info_dict:\n                name, id = hero_info_dict[hero], hero\n                pool.submit(self.process_hero, id, name)\n\n\nif __name__ == '__main__':\n    WZRYSkinSpider().crawl()\n\n```\n",
    "bugtrack_url": null,
    "license": "MIT",
    "summary": "\u722c\u866b\u8005",
    "version": "0.1.1",
    "project_urls": {
        "Homepage": "https://github.com/markadc/craws"
    },
    "split_keywords": [
        "python",
        " spider"
    ],
    "urls": [
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "ba36c45773710f668de77b35ebd8e1e896b4ee23755b2053ba4126e75cb09432",
                "md5": "3ce38414db7cead2578e93a0e2e80919",
                "sha256": "1f59f65dbd8b30d13cbc048672fb819cc06f58a691be7aca1a4797b8fa1fd1cb"
            },
            "downloads": -1,
            "filename": "craws-0.1.1.tar.gz",
            "has_sig": false,
            "md5_digest": "3ce38414db7cead2578e93a0e2e80919",
            "packagetype": "sdist",
            "python_version": "source",
            "requires_python": null,
            "size": 8550,
            "upload_time": "2024-12-28T15:10:13",
            "upload_time_iso_8601": "2024-12-28T15:10:13.196385Z",
            "url": "https://files.pythonhosted.org/packages/ba/36/c45773710f668de77b35ebd8e1e896b4ee23755b2053ba4126e75cb09432/craws-0.1.1.tar.gz",
            "yanked": false,
            "yanked_reason": null
        }
    ],
    "upload_time": "2024-12-28 15:10:13",
    "github": true,
    "gitlab": false,
    "bitbucket": false,
    "codeberg": false,
    "github_user": "markadc",
    "github_project": "craws",
    "github_not_found": true,
    "lcname": "craws"
}
        
Elapsed time: 0.50642s