# robotsparser
Python library that parses robots.txt files
## Functionalities
- Automatically discover all sitemap files
- Unzip gzipped sitemap files
- Fetch all URLs from sitemaps
## Install
```
pip install robotsparser
```
## Usage
```python
from robotsparser.parser import Robotparser
robots_url = "https://www.example.com/robots.txt"
rb = Robotparser(url=robots_url, verbose=True)
# Initiate the crawl of sitemaps and indexed URLs. The sitemap_url_crawl_limit argument is optional
rb.read(fetch_sitemap_urls=True, sitemap_url_crawl_limit=5)
# Show information
rb.get_sitemap_indexes() # returns sitemap indexes
rb.get_sitemaps()        # returns sitemaps
rb.get_urls()            # returns a list of all URLs
```
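
The accessor methods return plain Python lists, so the results can be filtered or persisted with the standard library alone. The snippet below is a minimal sketch that continues from the example above; the `/blog/` pattern and the output filename are illustrative, not part of the library.

```python
# Continuing from the snippet above: post-process the discovered URLs with
# standard Python. The "/blog/" pattern and output filename are only examples.
urls = rb.get_urls()
blog_urls = [u for u in urls if "/blog/" in u]

with open("blog_urls.txt", "w") as f:
    f.write("\n".join(blog_urls))

print(f"kept {len(blog_urls)} of {len(urls)} discovered URLs")
```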
## Multiprocessing usage (crawl in the background)
Crawl in the background and stream new entries to a file as they are discovered.
This is useful for sites whose sitemaps are heavily nested and take a long time
to crawl.
```python
from robotsparser.parser import Robotparser
import multiprocessing as mp
from sh import tail
if __name__ == '__main__':
    mp.freeze_support()
    robots_url = "https://www.example.com/robots.txt"
    entries_log_file = "./entries.log"
    rb = Robotparser(url=robots_url, verbose=False, sitemap_entries_file=entries_log_file)

    # Run the sitemap crawl in a background process so it does not block
    sitemap_crawl_proc = mp.Process(target=rb.read, kwargs={'fetch_sitemap_urls': False})
    sitemap_crawl_proc.start()

    # Follow the entries file and print new URLs as the crawler writes them
    for line in tail("-f", entries_log_file, _iter=True):
        print(line.replace("\n", ""))
        if not sitemap_crawl_proc.is_alive():
            break
```
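
Note that `sh` only supports Unix-like systems, so the `tail -f` loop above will not work on Windows. Below is a minimal pure-standard-library sketch of the same idea; the one-second polling interval and the seek/tell bookkeeping are assumptions for illustration, not library behavior.

```python
from robotsparser.parser import Robotparser
import multiprocessing as mp
import time

if __name__ == '__main__':
    mp.freeze_support()
    robots_url = "https://www.example.com/robots.txt"
    entries_log_file = "./entries.log"
    rb = Robotparser(url=robots_url, verbose=False, sitemap_entries_file=entries_log_file)

    sitemap_crawl_proc = mp.Process(target=rb.read, kwargs={'fetch_sitemap_urls': False})
    sitemap_crawl_proc.start()

    position = 0
    while True:
        crawler_running = sitemap_crawl_proc.is_alive()
        try:
            # Read anything appended since the last pass (binary mode keeps
            # the seek/tell bookkeeping simple)
            with open(entries_log_file, "rb") as f:
                f.seek(position)
                chunk = f.read()
                position = f.tell()
            for line in chunk.decode().splitlines():
                print(line)
        except FileNotFoundError:
            pass  # the crawler may not have created the file yet
        if not crawler_running:
            break
        time.sleep(1)  # assumed polling interval
```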