Name | spider-ms |
Version | 1.0.2 |
JSON | download |
home_page | https://github.com/musen123 |
Summary | 一个高级爬虫框架 |
upload_time | 2024-06-22 05:44:12 |
maintainer | None |
docs_url | None |
author | MuSen |
requires_python | >=3.8 |
license | None |
keywords |
|
VCS |
|
bugtrack_url |
|
requirements |
No requirements were recorded.
|
Travis-CI |
No Travis.
|
coveralls test coverage |
No coveralls.
|
# ms_spider爬虫框架的使用介绍
### 作者:Musen
## 一、配置文档
### 1、浏览器配置项
```python
# 浏览器类型
BROWSER: str = 'chrome'
# 默认为False,如果浏览器需要使用本地登录态,则设置为True,并配置浏览器路径和缓存文件路径
IS_LOCAL_BROWSER: bool = False
# 浏览器路径
BROWSER_PATH: str = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
# 浏览器用户缓存文件路径
USER_DIR: str = r'C:\Users\zengyanzhi\AppData\Local\Google\Chrome\User Data'
# 是否启用调试模式(使用本地的chrome浏览器)
DEBUG = False
# 浏览器远程调试端口()
PORT = 19789
# 是否加载图片(关闭可提升数据抓取效率)
IS_LOAD_IMAGE = True
# 是否使用无头模式
IS_HEADLESS: bool = True
# 翻页操作间隔时间(控制抓取频率,防止反爬)
TIME_INTERVAL: int = random.randint(1, 3)
```
### 2、通用爬虫配置项
```python
# 初始启动的url地址
start_url: str = ''
# 页面数据列表的定位表达式(css或xpath均支持)
data_list_loc: str = ''
```
### 3、自动翻页爬虫
```python
# 自动翻页,下一页按钮的定位表达式(css或xpath均支持)
next_page_btn_loc: str = ""
# 下一页按钮距离页面底部的距离
next_button_distance: int = 200
# 数据分割的标识符(一般不用)
split_str: str = '\n'
# 抓取多少页
pages: int = 1
# 要提取的字段:{key:[v1,v2]}
# key为保存的字段名称,v1为提取的属性,v2为定位表达式(css或xpath均支持)
data_extract_loc = {
'score': ('text', '//span[@class="real font-bold"]'),
'name': ('text', '//span[@class="name font-bold"]'),
'price': ('text', '//span[@class="real-price font-bold"]'),
}
```
- #### 案例:大众点评
```python
class DZDPSpider(BasePageCrawler):
"""大众点评爬虫"""
DEBUG = True
start_url = 'https://www.dianping.com/changsha/ch10/g112'
data_list_loc = '//*[@id="shop-all-list"]/ul/li'
next_page_btn_loc = '//a[text()="下一页"]'
next_button_distance = 200
split_str = '\n'
pages = 2
data_extract_loc = {
'url': ('href', '//div[@class="tit"]/a[1]'),
'name': ('text', '//div[@class="tit"]/a/h4'),
'price': ('text', '//a[@class="mean-price"]'),
'recommend': ('text', '//div[@class="recommend"]'),
}
```
### 4、滚动点击动态加载爬虫
```python
# 动态加载更多的按钮
loader_more_loc = '//div[@class="list-btn-more"]/div'
# 加载的次数(如果加载所有数据会自动停止)
loaders = 20
```
- #### 案例:
```python
class XCJDSpider(ScrollLoaderSpider):
"""携程酒店数据抓取"""
DEBUG = True
data_list_loc = '//li[@class="list-item-target"]'
loader_more_loc = '//div[@class="list-btn-more"]/div'
loaders = 20
data_extract_loc = {
'score': ('text', '//span[@class="real font-bold"]'),
'name': ('text', '//span[@class="name font-bold"]'),
'price': ('text', '//span[@class="real-price font-bold"]'),
}
start_url = 'https://hotels.ctrip.com/hotels/list?countryId=1&city=4&checkin=2024/05/01&checkout=2024/05/03&optionId=4&optionType=City&directSearch=0&display=%E9%87%8D%E5%BA%86&crn=1&adult=1&children=0&searchBoxArg=t&travelPurpose=0&ctm_ref=ix_sb_dl&domestic=1&&highPrice=-1&barCurr=CNY&hotPoi=50%7C50%7C4197113&sort=9'
```
### 5、深度爬虫
```python
# 深度页面打开的间隔时间(需要控制好,不然容易反爬)
open_page_interval: int = 3
# 深度url链接的提取规则
deep_link_url: str = ''
# 深度页面数据提取规则
deep_data_extract_loc: dict = {}
```
- #### 案例:
```python
class DZDPSpider(DeepPageCrawler):
"""大众点评深度爬虫"""
# 是否使用本地chrome
DEBUG = True
open_page_interval = 5
# 起始url
start_url = 'https://www.dianping.com/search/keyword/344/0_%E7%BE%8E%E5%AE%B9'
# 数据列表
data_list_loc = '//div[@id="shop-all-list"]//ul/li'
# 下一页按钮
next_page_btn_loc = "//a[text()='下一页']"
# 下一页距离底部距离
next_button_distance = 200
# 抓取页数
pages = 3
# 列表页数据提取规则
data_extract_loc = {
'url': ('href', '//div[@class="tit"]/a[1]'),
'name': ('text', '//div[@class="tit"]/a/h4'),
'price': ('text', '//a[@class="mean-price"]'),
}
# ==============深度页面抓取=============
# 深度抓取url提取规则
deep_link_url = '//div[@class="tit"]/a[1]'
# 深度抓取数据提取规则
deep_data_extract_loc = {
'addr': ('text', '//div[@class="expand-info address"]'),
"mobile": ('text', '//p[@class="expand-info tel"]')
}
```
## 二、视频图片爬虫案例
### 1、图片爬虫
```python
from spider.media_spider import ImagesSpider
class MZDPSpider(ImagesSpider):
"""图片下载"""
DEBUG = True
# 页面地址
start_url = 'https://www.pexels.com/zh-cn/'
# 图片文件的前缀
image_start_path = "https://images.pexels.com/photos"
# 图片保存的路径
image_save_path = 'D:\projectCode\MusenSpider\images'
def opened(self):
"""打开页面之后的操作"""
if __name__ == '__main__':
MZDPSpider().main()
```
### 2、抖音视频批量抓取
```python
import random
import time
from spider.media_spider import VideoSpider
class DouYinSpider(VideoSpider):
"""抖音up账号视频爬虫"""
# 文件的开头路径
DEBUG = True
# 要抓取的抖音up主的首页地址
start_url: str = 'https://www.douyin.com/user/MS4wLjABAAAAOlZ8ngnt417GKBbFysKt2Q8ERj84-Wb9xypbB8_hmIc?vid=7369137414838684954'
# 视频保存路径
video_save_path: str = r'D:\projectCode\MusenSpider\video\木瓜电影'
# 视频地址前缀
video_start_path: str = 'https://v3-weba.douyinvod.com'
# 下载的视频类型
file_types: list = ['video/mp4']
# 从url中提取文件名的规则
file_name_pattern: str = r'.com/.+?/(.+?)/video'
# 音频文件标签
audio_tag: str = 'media-audio-und-mp4a'
# 视频文件标签
video_tag: str = 'media-video-hvc1'
def opened(self):
# 获取所有的url
a_list = self.page.locator('//ul[@class="e6wsjNLL bGEvyQfj"]//a').all()
print("up主的视频数量:", len(a_list))
for i in range(len(a_list)):
if i == 0:
a_list[i].click()
time.sleep(random.randint(3, 8))
self.page.mouse.wheel(0, 100)
if __name__ == '__main__':
DouYinSpider().main()
```
### 3、快手视频批量抓取
```python
import random
import time
from spider.media_spider import VideoSpider
class KuaiShouSpider(VideoSpider):
"""快手up账号视频爬虫"""
# 文件的开头路径
DEBUG = True
# 要抓取的快手up主的首页地址
start_url: str = 'https://www.kuaishou.com/profile/3x3fy6cyami7ai6'
# 视频保存路径
video_save_path: str = r'D:\projectCode\MusenSpider\video\快手'
# 视频地址前缀
video_start_path: str = 'https://v3-weba.douyinvod.com'
# 下载的视频类型
file_types: list = ['video/mp4']
# 从url中提取文件名的规则
file_name_pattern: str = r'&clientCacheKey=(.+?)&'
def opened(self):
# 获取所有的url
a_list = self.page.locator('//div[@class="card-link"]').all()
print("up主的视频数量:", len(a_list))
for i in range(len(a_list)):
print(f"下载第{i + 1}个视频")
if i == 0:
a_list[i].click()
time.sleep(random.randint(3, 8))
# 切换下一个视频
self.page.click('//div[@class="switch-item video-switch-next"]')
if __name__ == '__main__':
KuaiShouSpider().main()
```
####
Raw data
{
"_id": null,
"home_page": "https://github.com/musen123",
"name": "spider-ms",
"maintainer": null,
"docs_url": null,
"requires_python": ">=3.8",
"maintainer_email": null,
"keywords": null,
"author": "MuSen",
"author_email": "121292678@qq.com",
"download_url": "https://files.pythonhosted.org/packages/2d/34/59f93b57639bcb81c693c792633e0956457d26542c8ab298fd1e4072ea13/spider_ms-1.0.2.tar.gz",
"platform": null,
"description": "# ms_spider\u722c\u866b\u6846\u67b6\u7684\u4f7f\u7528\u4ecb\u7ecd\r\n\r\n### \u4f5c\u8005\uff1aMusen\r\n\r\n## \u4e00\u3001\u914d\u7f6e\u6587\u6863\r\n\r\n### 1\u3001\u6d4f\u89c8\u5668\u914d\u7f6e\u9879\r\n\r\n\r\n```python\r\n# \u6d4f\u89c8\u5668\u7c7b\u578b\r\nBROWSER: str = 'chrome' \r\n# \u9ed8\u8ba4\u4e3aFlase,\u5982\u679c\u6d4f\u89c8\u5668\u9700\u8981\u4f7f\u7528\u672c\u5730\u767b\u5f55\u6001\uff0c\u5219\u8bbe\u7f6e\u4e3aTrue\uff0c\u5e76\u914d\u7f6e\u6d4f\u89c8\u5668\u8def\u5f84\u548c\u7f13\u5b58\u6587\u4ef6\u8def\u5f84\r\nIS_LOCAL_BROWSER: bool = False\r\n# \u6d4f\u89c8\u5668\u8def\u5f84\r\nBROWSER_PATH: str = r'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe'\r\n \r\n# \u6d4f\u89c8\u5668\u7528\u6237\u7f13\u5b58\u6587\u4ef6\u8def\u5f84\r\nUSER_DIR: str = r'C:\\Users\\zengyanzhi\\AppData\\Local\\Google\\Chrome\\User Data'\r\n \r\n# \u662f\u5426\u542f\u7528\u8c03\u8bd5\u6a21\u5f0f(\u4f7f\u7528\u672c\u5730\u7684chrome\u6d4f\u89c8\u5668)\r\nDEBUG = False\r\n# \u6d4f\u89c8\u5668\u8fdc\u7a0b\u8c03\u8bd5\u7aef\u53e3()\r\nPORT = 19789\r\n\r\n\r\n\r\n\r\n# \u662f\u5426\u52a0\u8f7d\u56fe\u7247\uff08\u5173\u95ed\u53ef\u63d0\u5347\u6570\u636e\u6293\u53d6\u6548\u7387\uff09\r\nIS_LOAD_IMAGE = True\r\n# \u662f\u5426\u4f7f\u7528\u65e0\u5934\u6a21\u5f0f\r\nIS_HEADLESS: bool = True\r\n# \u7ffb\u9875\u64cd\u4f5c\u95f4\u9694\u65f6\u95f4(\u63a7\u5236\u6293\u53d6\u9891\u7387\uff0c\u9632\u6b62\u53cd\u722c)\r\nTIME_INTERVAL: int = random.randint(1, 3)\r\n \r\n \r\n```\r\n\r\n### 2\u3001\u901a\u7528\u722c\u866b\u914d\u7f6e\u9879\r\n\r\n```python\r\n# \u521d\u59cb\u542f\u52a8\u7684url\u5730\u5740\r\nstart_url: str = ''\r\n# \u9875\u9762\u6570\u636e\u5217\u8868\u7684\u5b9a\u4f4d\u8868\u8fbe\u5f0f\uff08css\u6216xpath\u5747\u652f\u6301\uff09\r\ndata_list_loc: str = ''\r\n```\r\n\r\n### 3\u3001\u81ea\u52a8\u7ffb\u9875\u722c\u866b\r\n\r\n```python\r\n# 
\u81ea\u52a8\u7ffb\u9875\uff0c\u4e0b\u4e00\u9875\u6309\u94ae\u7684\u5b9a\u4f4d\u8868\u8fbe\u5f0f\uff08css\u6216xpath\u5747\u652f\u6301\uff09\r\nnext_page_btn_loc: str = \"\"\r\n# \u4e0b\u4e00\u9875\u6309\u94ae\u8ddd\u79bb\u9875\u9762\u5e95\u90e8\u7684\u8ddd\u79bb\r\nnext_button_distance: int = 200\r\n# \u6570\u636e\u5206\u5272\u7684\u6807\u8bc6\u7b26(\u4e00\u822c\u4e0d\u7528)\r\nsplit_str: str = '\\n'\r\n# \u6293\u53d6\u591a\u5c11\u9875\r\npages: int = 1\r\n# \u8981\u63d0\u53d6\u7684\u5b57\u6bb5\uff1a{key:[v1,v2]}\r\n# key\u4e3a\u4fdd\u5b58\u7684\u5b57\u6bb5\u540d\u79f0\uff0cv1\u4e3a\u63d0\u53d6\u7684\u5c5e\u6027\uff0cv2\u4e3a\u5b9a\u4f4d\u8868\u8fbe\u5f0f\uff08css\u6216xpath\u5747\u652f\u6301\uff09\r\ndata_extract_loc = {\r\n 'score': ('text', '//span[@class=\"real font-bold\"]'),\r\n 'name': ('text', '//span[@class=\"name font-bold\"]'),\r\n 'price': ('text', '//span[@class=\"real-price font-bold\"]'),\r\n }\r\n```\r\n\r\n- #### \u6848\u4f8b\uff1a\u5927\u4f17\u70b9\u8bc4\r\n\r\n ```python\r\n class DZDPSpider(BasePageCrawler):\r\n \"\"\"\u5927\u4f17\u70b9\u8bc4\u722c\u866b\"\"\"\r\n DEBUG = True\r\n start_url = 'https://www.dianping.com/changsha/ch10/g112'\r\n data_list_loc = '//*[@id=\"shop-all-list\"]/ul/li'\r\n next_page_btn_loc = '//a[text()=\"\u4e0b\u4e00\u9875\"]'\r\n next_button_distance = 200\r\n split_str = '\\n'\r\n pages = 2\r\n data_extract_loc = {\r\n 'url': ('href', '//div[@class=\"tit\"]/a[1]'),\r\n 'name': ('text', '//div[@class=\"tit\"]/a/h4'),\r\n 'price': ('text', '//a[@class=\"mean-price\"]'),\r\n 'recommend': ('text', '//div[@class=\"recommend\"]'),\r\n }\r\n ```\r\n\r\n \r\n\r\n### 4\u3001\u6eda\u52a8\u70b9\u51fb\u52a8\u6001\u52a0\u8f7d\u722c\u866b\r\n\r\n```python\r\n# \u52a8\u6001\u52a0\u8f7d\u66f4\u591a\u7684\u6309\u94ae\r\nloader_more_loc = '//div[@class=\"list-btn-more\"]/div'\r\n# \u52a0\u8f7d\u7684\u6b21\u6570(\u5982\u679c\u52a0\u8f7d\u6240\u6709\u6570\u636e\u4f1a\u81ea\u52a8\u505c\u6b62)\r\nloaders = 20\r\n```\r\n\r\n- #### 
\u6848\u4f8b\uff1a\r\n\r\n```python\r\nclass XCJDSpider(ScrollLoaderSpider):\r\n \"\"\"\u643a\u7a0b\u9152\u5e97\u6570\u636e\u6293\u53d6\"\"\"\r\n DEBUG = True\r\n data_list_loc = '//li[@class=\"list-item-target\"]'\r\n loader_more_loc = '//div[@class=\"list-btn-more\"]/div'\r\n loaders = 20\r\n data_extract_loc = {\r\n 'score': ('text', '//span[@class=\"real font-bold\"]'),\r\n 'name': ('text', '//span[@class=\"name font-bold\"]'),\r\n 'price': ('text', '//span[@class=\"real-price font-bold\"]'),\r\n }\r\n start_url = 'https://hotels.ctrip.com/hotels/list?countryId=1&city=4&checkin=2024/05/01&checkout=2024/05/03&optionId=4&optionType=City&directSearch=0&display=%E9%87%8D%E5%BA%86&crn=1&adult=1&children=0&searchBoxArg=t&travelPurpose=0&ctm_ref=ix_sb_dl&domestic=1&&highPrice=-1&barCurr=CNY&hotPoi=50%7C50%7C4197113&sort=9'\r\n\r\n\r\n```\r\n\r\n\r\n\r\n### 5\u3001\u6df1\u5ea6\u722c\u866b\r\n\r\n```python\r\n# \u6df1\u5ea6\u9875\u9762\u6253\u5f00\u7684\u95f4\u9694\u65f6\u95f4(\u9700\u8981\u63a7\u5236\u597d\uff0c\u4e0d\u7136\u5bb9\u6613\u53cd\u722c)\r\nopen_page_interval: int = 3\r\n# \u6df1\u5ea6url\u94fe\u63a5\u7684\u63d0\u53d6\u89c4\u5219\r\ndeep_link_url: str = ''\r\n# \u6df1\u5ea6\u9875\u9762\u6570\u636e\u63d0\u53d6\u89c4\u5219\r\ndeep_data_extract_loc: dict = {}\r\n```\r\n\r\n- #### \u6848\u4f8b\uff1a\r\n\r\n```python\r\nclass DZDPSpider(DeepPageCrawler):\r\n \"\"\"\u5927\u4f17\u70b9\u8bc4\u6df1\u5ea6\u722c\u866b\"\"\"\r\n # \u662f\u5426\u4f7f\u7528\u672c\u5730chrome\r\n DEBUG = True\r\n open_page_interval = 5\r\n # \u8d77\u59cburl\r\n start_url = 'https://www.dianping.com/search/keyword/344/0_%E7%BE%8E%E5%AE%B9'\r\n # \u6570\u636e\u5217\u8868\r\n data_list_loc = '//div[@id=\"shop-all-list\"]//ul/li'\r\n # \u4e0b\u4e00\u9875\u6309\u94ae\r\n next_page_btn_loc = \"//a[text()='\u4e0b\u4e00\u9875']\"\r\n # \u4e0b\u4e00\u9875\u8ddd\u79bb\u5e95\u90e8\u8ddd\u79bb\r\n next_button_distance = 200\r\n # \u6293\u53d6\u9875\u6570\r\n pages = 3\r\n # 
\u5217\u8868\u9875\u6570\u636e\u63d0\u53d6\u89c4\u5219\r\n data_extract_loc = {\r\n 'url': ('href', '//div[@class=\"tit\"]/a[1]'),\r\n 'name': ('text', '//div[@class=\"tit\"]/a/h4'),\r\n 'price': ('text', '//a[@class=\"mean-price\"]'),\r\n }\r\n # ==============\u6df1\u5ea6\u9875\u9762\u6293\u53d6=============\r\n # \u6df1\u5ea6\u6293\u53d6url\u63d0\u53d6\u89c4\u5219\r\n deep_link_url = '//div[@class=\"tit\"]/a[1]'\r\n # \u6df1\u5ea6\u6293\u53d6\u6570\u636e\u63d0\u53d6\u89c4\u5219\r\n deep_data_extract_loc = {\r\n 'addr': ('text', '//div[@class=\"expand-info address\"]'),\r\n \"mobile\": ('text', '//p[@class=\"expand-info tel\"]')\r\n }\r\n```\r\n\r\n## \u4e8c\u3001\u89c6\u9891\u56fe\u7247\u722c\u866b\u6848\u4f8b\r\n\r\n### 1\u3001\u56fe\u7247\u722c\u866b\r\n\r\n```python\r\nfrom spider.media_spider import ImagesSpider\r\n\r\n\r\nclass MZDPSpider(ImagesSpider):\r\n \"\"\"\u56fe\u7247\u4e0b\u8f7d\"\"\"\r\n DEBUG = True\r\n # \u9875\u9762\u5730\u5740\r\n start_url = 'https://www.pexels.com/zh-cn/'\r\n # \u56fe\u7247\u6587\u4ef6\u7684\u524d\u7f00\r\n image_start_path = \"https://images.pexels.com/photos\"\r\n # \u56fe\u7247\u4fdd\u5b58\u7684\u8def\u5f84\r\n image_save_path = 'D:\\projectCode\\MusenSpider\\images'\r\n\r\n def opened(self):\r\n \"\"\"\u6253\u5f00\u9875\u9762\u4e4b\u540e\u7684\u64cd\u4f5c\"\"\"\r\n\r\n\r\nif __name__ == '__main__':\r\n MZDPSpider().main()\r\n```\r\n\r\n\r\n\r\n### 2\u3001\u6296\u97f3\u89c6\u9891\u6279\u91cf\u6293\u53d6\r\n\r\n```python\r\nimport random\r\nimport time\r\nfrom spider.media_spider import VideoSpider\r\n\r\n\r\nclass DouYinSpider(VideoSpider):\r\n \"\"\"\u6296\u97f3up\u8d26\u53f7\u89c6\u9891\u722c\u866b\"\"\"\r\n # \u6587\u4ef6\u7684\u5f00\u5934\u8def\u5f84\r\n DEBUG = True\r\n # \u8981\u6293\u53d6\u7684\u6296\u97f3up\u4e3b\u7684\u9996\u9875\u5730\u5740\r\n start_url: str = 'https://www.douyin.com/user/MS4wLjABAAAAOlZ8ngnt417GKBbFysKt2Q8ERj84-Wb9xypbB8_hmIc?vid=7369137414838684954'\r\n # 
\u89c6\u9891\u4fdd\u5b58\u8def\u5f84\r\n video_save_path: str = r'D:\\projectCode\\MusenSpider\\video\\\u6728\u74dc\u7535\u5f71'\r\n # \u89c6\u9891\u5730\u5740\u524d\u7f00\r\n video_start_path: str = 'https://v3-weba.douyinvod.com'\r\n # \u4e0b\u8f7d\u7684\u89c6\u9891\u7c7b\u578b\r\n file_types: list = ['video/mp4']\r\n # \u4eceurl\u4e2d\u63d0\u53d6\u6587\u4ef6\u540d\u7684\u89c4\u5219\r\n file_name_pattern: str = r'.com/.+?/(.+?)/video'\r\n # \u97f3\u9891\u6587\u4ef6\u6807\u7b7e\r\n audio_tag: str = 'media-audio-und-mp4a'\r\n # \u89c6\u9891\u6587\u4ef6\u6807\u7b7e\r\n video_tag: str = 'media-video-hvc1'\r\n\r\n def opened(self):\r\n # \u83b7\u53d6\u6240\u6709\u7684url\r\n a_list = self.page.locator('//ul[@class=\"e6wsjNLL bGEvyQfj\"]//a').all()\r\n print(\"up\u4e3b\u7684\u89c6\u9891\u6570\u91cf\uff1a\", len(a_list))\r\n for i in range(len(a_list)):\r\n if i == 0:\r\n a_list[i].click()\r\n time.sleep(random.randint(3, 8))\r\n self.page.mouse.wheel(0, 100)\r\n\r\n\r\nif __name__ == '__main__':\r\n DouYinSpider().main()\r\n\r\n```\r\n\r\n\r\n\r\n### 3\u3001\u5feb\u624b\u89c6\u9891\u6279\u91cf\u6293\u53d6\r\n\r\n```python\r\nimport random\r\nimport time\r\nfrom spider.media_spider import VideoSpider\r\n\r\n\r\nclass KuaiShouSpider(VideoSpider):\r\n \"\"\"\u5feb\u624bup\u8d26\u53f7\u89c6\u9891\u722c\u866b\"\"\"\r\n # \u6587\u4ef6\u7684\u5f00\u5934\u8def\u5f84\r\n DEBUG = True\r\n # \u8981\u6293\u53d6\u7684\u6296\u97f3up\u4e3b\u7684\u9996\u9875\u5730\u5740\r\n start_url: str = 'https://www.kuaishou.com/profile/3x3fy6cyami7ai6'\r\n # \u89c6\u9891\u4fdd\u5b58\u8def\u5f84\r\n video_save_path: str = r'D:\\projectCode\\MusenSpider\\video\\\u5feb\u624b'\r\n # \u89c6\u9891\u5730\u5740\u524d\u7f00\r\n video_start_path: str = 'https://v3-weba.douyinvod.com'\r\n # \u4e0b\u8f7d\u7684\u89c6\u9891\u7c7b\u578b\r\n file_types: list = ['video/mp4']\r\n # \u4eceurl\u4e2d\u63d0\u53d6\u6587\u4ef6\u540d\u7684\u89c4\u5219\r\n file_name_pattern: str = r'&clientCacheKey=(.+?)&'\r\n\r\n def 
opened(self):\r\n # \u83b7\u53d6\u6240\u6709\u7684url\r\n a_list = self.page.locator('//div[@class=\"card-link\"]').all()\r\n print(\"up\u4e3b\u7684\u89c6\u9891\u6570\u91cf\uff1a\", len(a_list))\r\n for i in range(len(a_list)):\r\n print(f\"\u4e0b\u8f7d\u7b2c{i + 1}\u4e2a\u89c6\u9891\")\r\n if i == 0:\r\n a_list[i].click()\r\n time.sleep(random.randint(3, 8))\r\n # \u5207\u6362\u4e0b\u4e00\u4e2a\u89c6\u9891\r\n self.page.click('//div[@class=\"switch-item video-switch-next\"]')\r\n\r\n\r\nif __name__ == '__main__':\r\n KuaiShouSpider().main()\r\n```\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n#### \r\n",
"bugtrack_url": null,
"license": null,
"summary": "\u4e00\u4e2a\u9ad8\u7ea7\u722c\u866b\u6846\u67b6",
"version": "1.0.2",
"project_urls": {
"Homepage": "https://github.com/musen123"
},
"split_keywords": [],
"urls": [
{
"comment_text": "",
"digests": {
"blake2b_256": "f273064ac7b3d4e3cfd56a72d635a472354272871ef952a6b5c8445aa727f2c2",
"md5": "8237e4aec7826e218ec5b092722065de",
"sha256": "f46f236e76862670f8acb014d9f577033fc2974dd751baaa08465f6203a00410"
},
"downloads": -1,
"filename": "spider_ms-1.0.2-py3-none-any.whl",
"has_sig": false,
"md5_digest": "8237e4aec7826e218ec5b092722065de",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": ">=3.8",
"size": 12199,
"upload_time": "2024-06-22T05:44:10",
"upload_time_iso_8601": "2024-06-22T05:44:10.365318Z",
"url": "https://files.pythonhosted.org/packages/f2/73/064ac7b3d4e3cfd56a72d635a472354272871ef952a6b5c8445aa727f2c2/spider_ms-1.0.2-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
},
{
"comment_text": "",
"digests": {
"blake2b_256": "2d3459f93b57639bcb81c693c792633e0956457d26542c8ab298fd1e4072ea13",
"md5": "376bd9c13c414ab3e72307a7d164fa54",
"sha256": "4a38dd6797c15d18e265464b59dbcce20dc23150e527718ee2ea3fffc1556c51"
},
"downloads": -1,
"filename": "spider_ms-1.0.2.tar.gz",
"has_sig": false,
"md5_digest": "376bd9c13c414ab3e72307a7d164fa54",
"packagetype": "sdist",
"python_version": "source",
"requires_python": ">=3.8",
"size": 12513,
"upload_time": "2024-06-22T05:44:12",
"upload_time_iso_8601": "2024-06-22T05:44:12.962542Z",
"url": "https://files.pythonhosted.org/packages/2d/34/59f93b57639bcb81c693c792633e0956457d26542c8ab298fd1e4072ea13/spider_ms-1.0.2.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2024-06-22 05:44:12",
"github": false,
"gitlab": false,
"bitbucket": false,
"codeberg": false,
"lcname": "spider-ms"
}