# How To Install
`pip install HiveWebCrawler`
# v0.1.5 News
- With v0.1.5, erroneous scraping scenarios have been significantly reduced.
- Image scraping has been improved.
# Example:
## Sending requests:
```python
# Code
from HiveWebCrawler.Crawler import WebCrawler
CrawlerToolkit = WebCrawler()
request_data = CrawlerToolkit.send_request(target_url="https://google.com")
print(request_data.keys())
# Output
dict_keys(['success', 'message', 'url', 'status_code', 'timeout_val', 'method', 'data'])
```
## Crawling links from a response:
```python
# import Crawler
from HiveWebCrawler.Crawler import WebCrawler
# toolkit init
CrawlerToolkit = WebCrawler()
# sending http/s requests
request_data = CrawlerToolkit.send_request(target_url="https://google.com")
# checking status
if not request_data["success"]:
print(request_data["message"])
exit(1)
# Crawling links
crawled_links = CrawlerToolkit.crawl_links_from_pesponse_href(
original_target_url="https://google.com", # For feedback
response_text=request_data["data"]
)
# checking status
if not crawled_links["success"]:
print(request_data["message"])
exit(1)
# print dict keys
print(crawled_links.keys())
# print crawled links
for single_list in crawled_links["data_array"]:
print(single_list)
# OUTPUT
dict_keys(['success', 'data_array', 'original_url', 'message']) # dict keys
# Crawled links
['https://www.google.com/imghp?hl=tr&tab=wi', None]
['https://maps.google.com.tr/maps?hl=tr&tab=wl', None]
['https://play.google.com/?hl=tr&tab=w8', None]
['https://www.youtube.com/?tab=w1', None]
['https://news.google.com/?tab=wn', None]
['https://mail.google.com/mail/?tab=wm', None]
['https://drive.google.com/?tab=wo', None]
['https://www.google.com.tr/intl/tr/about/products?tab=wh', None]
['http://www.google.com.tr/history/optout?hl=tr', None]
['https://google.com/preferences?hl=tr', None]
['https://accounts.google.com/ServiceLogin?hl=tr&passive=true&continue=https://www.google.com/&ec=GAZAAQ', None]
['https://google.com/advanced_search?hl=tr&authuser=0', None]
['https://google.com/intl/tr/ads/', None]
['http://www.google.com.tr/intl/tr/services/', None]
['https://google.com/intl/tr/about.html', None]
['https://www.google.com/setprefdomain?prefdom=TR&prev=https://www.google.com.tr/&sig=K_nBMpLM40cwVr7j5Oqk31t_0TCeo%3D', None]
['https://google.com/intl/tr/policies/privacy/', None]
['https://google.com/intl/tr/policies/terms/', None]
```
## Crawling Images From a Response:
```python
# import Crawler
from HiveWebCrawler.Crawler import WebCrawler
# toolkit init
CrawlerToolkit = WebCrawler()
# sending http/s requests
request_data = CrawlerToolkit.send_request(target_url="https://google.com")
# checking status
if not request_data["success"]:
print(request_data["message"])
exit(1)
# Crawling Images
crawled_links = CrawlerToolkit.crawl_image_from_response(
original_url="https://google.com",
response_text=request_data["data"]
)
# checking status
if not crawled_links["success"]:
print(request_data["message"])
exit(1)
# print dict keys
print(crawled_links.keys())
# print crawled Images
for single_list in crawled_links["data_array"]:
print(single_list)
# OUTPUT
dict_keys(['success', 'data_array', 'original_url']) # dict keys
# Crawled Images
['https://google.com/images/branding/googlelogo/1x/googlelogo_white_background_color_272x92dp.png', 'Google', None]
['https://google.com/textinputassistant/tia.png', None, None]
```
## Crawling Phone Numbers or Email Addresses
### E-mail
```python
# import Crawler
from HiveWebCrawler.Crawler import WebCrawler
# toolkit init
CrawlerToolkit = WebCrawler()
# sending http/s requests
request_data = CrawlerToolkit.send_request(target_url="https://www.hurriyet.com.tr/bizeulasin/")
# checking status
if not request_data["success"]:
print(request_data["message"])
exit(1)
# Crawling email/s
crawled_links = CrawlerToolkit.crawl_email_address_from_response_href(response_text=request_data["data"])
# checking status
if not crawled_links["success"]:
print(request_data["message"])
exit(1)
# print dict keys
print(crawled_links.keys())
# print crawled email/s
for single_list in crawled_links["data_array"]:
print(single_list)
# OUTPUT
dict_keys(['success', 'data_array', 'message']) # dict keys
# Crawled emails
[None, 'CENSORED@hurriyet.com.tr']
```
### Phone Numbers
```python
# import Crawler
from HiveWebCrawler.Crawler import WebCrawler
# toolkit init
CrawlerToolkit = WebCrawler()
# sending http/s requests
request_data = CrawlerToolkit.send_request(target_url="https://www.hurriyet.com.tr/bizeulasin/")
# checking status
if not request_data["success"]:
print(request_data["message"])
exit(1)
# Crawling phone numbers
crawled_links = CrawlerToolkit.crawl_phone_number_from_response_href(response_text=request_data["data"])
# checking status
if not crawled_links["success"]:
print(request_data["message"])
exit(1)
# print dict keys
print(crawled_links.keys())
# print crawled phone numbers
for single_list in crawled_links["data_array"]:
print(single_list)
# OUTPUT
dict_keys(['success', 'data_array', 'message']) # dict keys
[None, '+90XXXXXXXXXXX'] # Crawled phone numbers
```
Raw data
{
"_id": null,
"home_page": "https://github.com/MehmetYukselSekeroglu/HiveWebCrawler",
"name": "HiveWebCrawler",
"maintainer": null,
"docs_url": null,
"requires_python": ">=3.8",
"maintainer_email": null,
"keywords": "PyPi, Web Crawler, Web Image Crawler, Web URL Crawler, Web Email Crawler, Web Phone Number Crawler",
"author": "MehmetYukselSekeroglu",
"author_email": "dijital_evren@protonmail.com",
"download_url": null,
"platform": null,
"description": "# How To Install\n\n`pip install HiveWebCrawler`\n\n\n# v0.1.5 News\n\n- With v0.1.5, erroneous scraping scenarios have been significantly reduced.\n- Image scraping has been improved.\n\n\n# Example:\n\n## Sending requests:\n\n```python\n# Code\nfrom HiveWebCrawler.Crawler import WebCrawler\nCrawlerToolkit = WebCrawler()\nrequest_data = CrawlerToolkit.send_request(target_url=\"https://google.com\")\nprint(request_data.keys())\n\n\n# Output \ndict_keys(['success', 'message', 'url', 'status_code', 'timeout_val', 'method', 'data'])\n```\n\n\n\n## Crawling link from response:\n\n```python\n# import Crawler\nfrom HiveWebCrawler.Crawler import WebCrawler\n\n# toolkit init\nCrawlerToolkit = WebCrawler()\n\n# sending http/s requests\nrequest_data = CrawlerToolkit.send_request(target_url=\"https://google.com\")\n\n# checking status\nif not request_data[\"success\"]:\n print(request_data[\"message\"])\n exit(1)\n \n# Crawling links\ncrawled_links = CrawlerToolkit.crawl_links_from_pesponse_href(\n original_target_url=\"https://google.com\", # For feedback\n response_text=request_data[\"data\"]\n )\n\n# checking status\nif not crawled_links[\"success\"]:\n print(request_data[\"message\"])\n exit(1)\n \n# print dict keys\nprint(crawled_links.keys())\n\n\n# print crawled links\nfor single_list in crawled_links[\"data_array\"]:\n print(single_list)\n\n\n\n# OUTPUT\n\ndict_keys(['success', 'data_array', 'original_url', 'message']) # dict keys\n\n# Crawled links\n['https://www.google.com/imghp?hl=tr&tab=wi', None]\n['https://maps.google.com.tr/maps?hl=tr&tab=wl', None]\n['https://play.google.com/?hl=tr&tab=w8', None]\n['https://www.youtube.com/?tab=w1', None]\n['https://news.google.com/?tab=wn', None]\n['https://mail.google.com/mail/?tab=wm', None]\n['https://drive.google.com/?tab=wo', None]\n['https://www.google.com.tr/intl/tr/about/products?tab=wh', None]\n['http://www.google.com.tr/history/optout?hl=tr', None]\n['https://google.com/preferences?hl=tr', 
None]\n['https://accounts.google.com/ServiceLogin?hl=tr&passive=true&continue=https://www.google.com/&ec=GAZAAQ', None]\n['https://google.com/advanced_search?hl=tr&authuser=0', None]\n['https://google.com/intl/tr/ads/', None]\n['http://www.google.com.tr/intl/tr/services/', None]\n['https://google.com/intl/tr/about.html', None]\n['https://www.google.com/setprefdomain?prefdom=TR&prev=https://www.google.com.tr/&sig=K_nBMpLM40cwVr7j5Oqk31t_0TCeo%3D', None]\n['https://google.com/intl/tr/policies/privacy/', None]\n['https://google.com/intl/tr/policies/terms/', None]\n \n```\n\n## Crawling Image From Response:\n\n\n```python\n# import Crawler\nfrom HiveWebCrawler.Crawler import WebCrawler\n\n# toolkit init\nCrawlerToolkit = WebCrawler()\n\n# sending http/s requests\nrequest_data = CrawlerToolkit.send_request(target_url=\"https://google.com\")\n\n# checking status\nif not request_data[\"success\"]:\n print(request_data[\"message\"])\n exit(1)\n \n# Crawling Images\ncrawled_links = CrawlerToolkit.crawl_image_from_response(\n original_url=\"https://google.com\",\n response_text=request_data[\"data\"]\n )\n\n\n# checking status\nif not crawled_links[\"success\"]:\n print(request_data[\"message\"])\n exit(1)\n \n# print dict keys\nprint(crawled_links.keys())\n\n\n# print crawled Images\nfor single_list in crawled_links[\"data_array\"]:\n print(single_list)\n\n\n# OUTPUT \n\ndict_keys(['success', 'data_array', 'original_url']) # dict keys\n\n# Crawled Images\n['https://google.com/images/branding/googlelogo/1x/googlelogo_white_background_color_272x92dp.png', 'Google', None]\n['https://google.com/textinputassistant/tia.png', None, None]\n\n```\n\n\n## Crawling Phone Number OR Email address\n\n\n### E-mail\n\n\n```python\n# import Crawler\nfrom HiveWebCrawler.Crawler import WebCrawler\n\n# toolkit init\nCrawlerToolkit = WebCrawler()\n\n# sending http/s requests\nrequest_data = CrawlerToolkit.send_request(target_url=\"https://www.hurriyet.com.tr/bizeulasin/\")\n\n# checking 
status\nif not request_data[\"success\"]:\n print(request_data[\"message\"])\n exit(1)\n \n# Crawling email/s\ncrawled_links = CrawlerToolkit.crawl_email_address_from_response_href(response_text=request_data[\"data\"])\n\n\n\n# checking status\nif not crawled_links[\"success\"]:\n print(request_data[\"message\"])\n exit(1)\n \n# print dict keys\nprint(crawled_links.keys())\n\n\n# print crawled email/s\nfor single_list in crawled_links[\"data_array\"]:\n print(single_list)\n\n\n# OUTPUT \n\ndict_keys(['success', 'data_array', 'message']) # dict keys\n\n# Crawled emails\n[None, 'CENSORED@hurriyet.com.tr']\n \n```\n\n### Phone Numbers\n\n\n```python\n# import Crawler\nfrom HiveWebCrawler.Crawler import WebCrawler\n\n# toolkit init\nCrawlerToolkit = WebCrawler()\n\n# sending http/s requests\nrequest_data = CrawlerToolkit.send_request(target_url=\"https://www.hurriyet.com.tr/bizeulasin/\")\n\n# checking status\nif not request_data[\"success\"]:\n print(request_data[\"message\"])\n exit(1)\n \n# Crawling phone numbers\ncrawled_links = CrawlerToolkit.crawl_phone_number_from_response_href(response_text=request_data[\"data\"])\n\n\n\n# checking status\nif not crawled_links[\"success\"]:\n print(request_data[\"message\"])\n exit(1)\n \n# print dict keys\nprint(crawled_links.keys())\n\n\n# print crawled phone numbers\nfor single_list in crawled_links[\"data_array\"]:\n print(single_list)\n\n\n# OUTPUT \ndict_keys(['success', 'data_array', 'message']) # dict keys\n[None, '+90XXXXXXXXXXX'] # Crawled phone numbers\n\n```\n\n",
"bugtrack_url": null,
"license": null,
"summary": "Python 3.x Web Crawler, Images, Urls, Emails, Phone numbers",
"version": "0.1.6",
"project_urls": {
"Bug Reports": "https://github.com/MehmetYukselSekeroglu/HiveWebCrawler/issues",
"Homepage": "https://github.com/MehmetYukselSekeroglu/HiveWebCrawler",
"Source Code": "https://github.com/MehmetYukselSekeroglu/HiveWebCrawler"
},
"split_keywords": [
"pypi",
" web crawler",
" web image crawler",
" web url crawler",
" web email crawler",
" web phone number crawler"
],
"urls": [
{
"comment_text": "",
"digests": {
"blake2b_256": "8655b64cc24a1857bb623125c0fc5e2ca954b1b676416102f13745da86fd33a1",
"md5": "093db8a30af426366144f6a4e877fc8e",
"sha256": "7a4a5ffb967d0328ce4ef1f8dcc64a2ee36a1b0767cee45b0b3ef27cdb0ebcb7"
},
"downloads": -1,
"filename": "HiveWebCrawler-0.1.6-py3-none-any.whl",
"has_sig": false,
"md5_digest": "093db8a30af426366144f6a4e877fc8e",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": ">=3.8",
"size": 8188,
"upload_time": "2024-06-07T10:47:15",
"upload_time_iso_8601": "2024-06-07T10:47:15.863890Z",
"url": "https://files.pythonhosted.org/packages/86/55/b64cc24a1857bb623125c0fc5e2ca954b1b676416102f13745da86fd33a1/HiveWebCrawler-0.1.6-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2024-06-07 10:47:15",
"github": true,
"gitlab": false,
"bitbucket": false,
"codeberg": false,
"github_user": "MehmetYukselSekeroglu",
"github_project": "HiveWebCrawler",
"travis_ci": false,
"coveralls": false,
"github_actions": false,
"lcname": "hivewebcrawler"
}