Name | Zhihu-Spider JSON |
Version |
1.2.5
JSON |
| download |
home_page | https://github.com/yanjlee/Zhihu_Spider |
Summary | Scrape Zhihu content and user social network information. It currently contains 314400 questions and 261376 users. |
upload_time | 2024-06-01 08:46:48 |
maintainer | None |
docs_url | None |
author | yanjlee |
requires_python | None |
license | None |
keywords |
|
VCS |
|
bugtrack_url |
|
requirements |
No requirements were recorded.
|
Travis-CI |
No Travis.
|
coveralls test coverage |
No coveralls.
|
Zhihu_Spider
============
Scrape Zhihu content and user social network information. The dataset currently contains 314400 questions and 261376 users.
### File Structure
* ./zhihu/zhihu : The files related to crawling zhihu.com
* ./zhihu/zhihu_dat/ : The structured data for baseline experiments on the Zhihu dataset
* ./zhihu/zhihu_dat/item.dat: the corpus (bag of words) of all questions, using Blei’s LDA-C format. The line number represents the question id (qid)
* ./zhihu/zhihu_dat/users.dat: the corpus of all users; each user's features are the bag-of-words representation of all the questions they have answered.
* ./zhihu/zhihu_dat/vocab.dat: the vocabulary of zhihu dataset
* ./zhihu/zhihu_dat/item_adj.dat: the questions and their answerer ids; the first column is the number of answers, and the line number is the question id
* ./zhihu/zhihu_dat/user_adj.dat: the users and their answered question ids; the line number is the user id
* ./zhihu/zhihu_dat/truth.dat: the questions and their answers; each answer has an associated score
Raw data
{
"_id": null,
"home_page": "https://github.com/yanjlee/Zhihu_Spider",
"name": "Zhihu-Spider",
"maintainer": null,
"docs_url": null,
"requires_python": null,
"maintainer_email": null,
"keywords": null,
"author": "yanjlee",
"author_email": "yanjlee@163.com",
"download_url": "https://files.pythonhosted.org/packages/cc/d7/3792be7ce93cc5bfe478c3248fd36f9075b5613b342d3f158b520a520886/zhihu_spider-1.2.5.tar.gz",
"platform": null,
"description": "Zhihu_Spider\r\n============\r\n\r\nScrapy the Zhihu content and user social network information. Now it contains 314400 questions and 261376 users.\r\n\r\n### File Strcture\r\n\r\n* ./zhihu/zhihu : The related files about crawling the zhihu.com\r\n* ./zhihu/zhihu_dat/ : The structured data for baseline experiments on zhihu dataset\r\n * ./zhihu/zhihu_dat/item.dat: the corpus(bag of words) of all questions, using Blei\u2019s LDA-C format. The line number represents qid\r\n * ./zhihu/zhihu_dat/users.dat: the corpus of all users, the features of users is the bag representations of all the questions they have answered.\r\n * ./zhihu/zhihu_dat/vocab.dat: the vocabulary of zhihu dataset\r\n * ./zhihu/zhihu_dat/item_adj.dat: the questions and their answerer ids, the first column is the number of answers, the line number is question id\r\n * ./zhihu/zhihu_dat/user_adj.dat: the users and their answered question ids, the line number the user id, \r\n * ./zhihu/zhihu_dat/truth.dat: the questions and their answers, each answer has a score with them\r\n\r\n",
"bugtrack_url": null,
"license": null,
"summary": "Scrapy the Zhihu content and user social network information. Now it contains 314400 questions and 261376 users..",
"version": "1.2.5",
"project_urls": {
"Homepage": "https://github.com/yanjlee/Zhihu_Spider"
},
"split_keywords": [],
"urls": [
{
"comment_text": "",
"digests": {
"blake2b_256": "50752e6b7877edd34aba1dd3de452fd57659cb3c4bf417008e65721f03381479",
"md5": "4d4fddebb773b8f2eb8f39f5d190cdd8",
"sha256": "4d108952dd0af43979f7e7e839dee130329268aa48bae0d787a2358f6bc2c2ff"
},
"downloads": -1,
"filename": "Zhihu_Spider-1.2.5-py3-none-any.whl",
"has_sig": false,
"md5_digest": "4d4fddebb773b8f2eb8f39f5d190cdd8",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": null,
"size": 18482,
"upload_time": "2024-06-01T08:46:46",
"upload_time_iso_8601": "2024-06-01T08:46:46.056988Z",
"url": "https://files.pythonhosted.org/packages/50/75/2e6b7877edd34aba1dd3de452fd57659cb3c4bf417008e65721f03381479/Zhihu_Spider-1.2.5-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
},
{
"comment_text": "",
"digests": {
"blake2b_256": "ccd73792be7ce93cc5bfe478c3248fd36f9075b5613b342d3f158b520a520886",
"md5": "608021e35a7c6df9b4b7834052dfd53f",
"sha256": "4bc21a87ac224b4531a5dcd8c27e10c8258664baa19d8bd41d4a92c249930aff"
},
"downloads": -1,
"filename": "zhihu_spider-1.2.5.tar.gz",
"has_sig": false,
"md5_digest": "608021e35a7c6df9b4b7834052dfd53f",
"packagetype": "sdist",
"python_version": "source",
"requires_python": null,
"size": 13837,
"upload_time": "2024-06-01T08:46:48",
"upload_time_iso_8601": "2024-06-01T08:46:48.176557Z",
"url": "https://files.pythonhosted.org/packages/cc/d7/3792be7ce93cc5bfe478c3248fd36f9075b5613b342d3f158b520a520886/zhihu_spider-1.2.5.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2024-06-01 08:46:48",
"github": true,
"gitlab": false,
"bitbucket": false,
"codeberg": false,
"github_user": "yanjlee",
"github_project": "Zhihu_Spider",
"travis_ci": false,
"coveralls": false,
"github_actions": false,
"lcname": "zhihu-spider"
}