```python
$ pip install site2hdd
from site2hdd import download_url_list,get_proxies,download_webpage
```
```python
xlsxfile,pklfile = get_proxies(
save_path_proxies_all_filtered='c:\\newfilepath\\myproxiefile\\proxy', # path doesn't have to exist, it will be created, last
# part (proxy) is the name of the file - pkl and xlsx will be added
# important: There will be 2 files, in this case: c:\\newfilepath\\myproxiefile\\proxy.pkl and c:\\newfilepath\\myproxiefile\\proxy.xlsx
http_check_timeout=4, # if proxy can't connect within 4 seconds to wikipedia, it is invalid
threads_httpcheck=50, # threads to check if the http connection is working
threads_ping=100 , # before the http test, there is a ping test to check if the server exists
silent=False, # show results when a working server has been found
max_proxies_to_check=2000, # stops the search at 2000
)
```
## Downloading lists of free proxy servers
<img title="" src="https://github.com/hansalemaos/screenshots/raw/main/site2hdd/site2hdd00000000.png" alt="">
## Checking if the ip exists
<img title="" src="https://github.com/hansalemaos/screenshots/raw/main/site2hdd/site2hdd00000001.png" alt="">
## Checking if http works and own IP is hidden
<img title="" src="https://github.com/hansalemaos/screenshots/raw/main/site2hdd/site2hdd00000004.png" alt="">
```python
urls = [r'''https://pandas.pydata.org/docs/#''', r'''https://pandas.pydata.org/docs/getting_started/index.html''',
r'''https://pandas.pydata.org/docs/user_guide/index.html''',
r'''https://pandas.pydata.org/docs/reference/index.html''',
r'''https://pandas.pydata.org/docs/development/index.html''',
r'''https://pandas.pydata.org/docs/whatsnew/index.html''', r'''https://pandas.pydata.org/docs/dev/index.html''',
r'''https://pandas.pydata.org/docs/index.html''',
r'''https://pandas.pydata.org/pandas-docs/version/1.4/index.html''',
r'''https://pandas.pydata.org/pandas-docs/version/1.3/index.html''',
r'''https://pandas.pydata.org/pandas-docs/version/1.2/index.html''',
r'''https://pandas.pydata.org/pandas-docs/version/1.1/index.html''',
r'''https://pandas.pydata.org/pandas-docs/version/1.0/index.html''',
r'''https://github.com/pandas-dev/pandas''', r'''https://twitter.com/pandas_dev''',
r'''https://pandas.pydata.org/docs/#pandas-documentation''', r'''https://pandas.pydata.org/docs/pandas.zip''',
r'''https://pandas.pydata.org/''', r'''https://pypi.org/project/pandas''',
r'''https://github.com/pandas-dev/pandas/issues''', r'''https://stackoverflow.com/questions/tagged/pandas''',
r'''https://groups.google.com/g/pydata''', r'''https://pandas.pydata.org/docs/#module-pandas''',
r'''https://www.python.org/''',
r'''https://pandas.pydata.org/docs/getting_started/index.html#getting-started''',
r'''https://pandas.pydata.org/docs/user_guide/index.html#user-guide''',
r'''https://pandas.pydata.org/docs/reference/index.html#api''',
r'''https://pandas.pydata.org/docs/development/index.html#development''',
r'''https://pandas.pydata.org/docs/_sources/index.rst.txt''', r'''https://numfocus.org/''',
r'''https://www.ovhcloud.com/''', r'''http://sphinx-doc.org/''', ]
download_url_list(urls, ProxyPickleFile='c:\\newfilepath\\myproxiefile\\proxyn.pkl',
# The file you created using the function: get_proxies
SaveFolder='f:\\testdlpandaslinks', # where should the files be saved
try_each_url_n_times=5, # maximum retries for each url
ProxyConfidenceLimit=10,
# each link will be downloaded twice and the results compared. If only one result is positive, it counts as an unsuccessful download - unless ProxyConfidenceLimit is higher, in which case the single result is accepted
ThreadLimit=50, # downloads at the same time
RequestsTimeout=10, # Timeout for requests
ThreadTimeout=12, # Should be a little higher than RequestsTimeout
SleepAfterKillThread=0.1, # Don't put 0.0 here - it will use too much CPU
SleepAfterStartThread=0.1, # Don't put 0.0 here - it will use too much CPU
IgnoreExceptions=True, )
```
## Downloading a url list
### Never close the app when this is the last message that was printed: "Batch done - writing files to HDD ..."
<img title="" src="https://github.com/hansalemaos/screenshots/raw/main/site2hdd/site2hdd00000007.png" alt="">
```python
# downloads only links from one domain! All others are ignored!
starturls=[r'''https://pydata.org/upcoming-events/''', # if it can't find links on the starting page, pass a list of links from the site.
r'''https://pydata.org/past-events/''',
r'''https://pydata.org/organize-a-conference/''',
r'''https://pydata.org/start-a-meetup/''',
r'''https://pydata.org/volunteer/''',
r'''https://pydata.org/code-of-conduct/''',
r'''https://pydata.org/diversity-inclusion/''',
r'''https://pydata.org/wp-content/uploads/2022/03/PyData-2022-Sponsorship-Prospectus-v4-1.pdf''',
r'''https://pydata.org/sponsor-pydata/#''',
r'''https://pydata.org/faqs/''',
r'''https://pydata.org/''',
r'''https://pydata.org/about/''',
r'''https://pydata.org/sponsor-pydata/''',
r'''https://pydata.org/wp-content/uploads/2022/03/PyData-2022-Sponsorship-Prospectus-v4.pdf''',]
download_webpage(
ProxyPickleFile='c:\\newfilepath\\myproxiefile\\proxyn.pkl',
DomainName="pydata.org",
DomainLink="https://pydata.org/",
SaveFolder=r"f:\pandashomepagetest",
ProxyConfidenceLimit=10,
UrlsAtOnce=100,
ThreadLimit=50,
RequestsTimeout=10,
ThreadTimeout=12,
SleepAfterKillThread=0.1,
SleepAfterStartThread=0.1,
IgnoreExceptions=True,
proxy_http_check_timeout=4,
proxy_threads_httpcheck=65,
proxy_threads_ping=100,
proxy_silent=False,
proxy_max_proxies_to_check=1000,
starturls=starturls,
)
```
## Downloading a whole page
<img title="" src="https://github.com/hansalemaos/screenshots/raw/main/site2hdd/site2hdd00000007.png" alt="">
```python
# Command line also works, but you can't use starturls, and the proxy.pkl has to exist already!
# Best usage is to continue a download that hasn't been finished yet.
# Existing files won't be downloaded again!
import subprocess
import sys
subprocess.run(
[
sys.executable,
r"C:\Users\USERNAME\anaconda3\envs\ENVNAME\Lib\site-packages\site2hdd\__init__.py",
r"C:\Users\USERNAME\anaconda3\envs\ENVNAME\pandaspyd.ini",
]
)
# This is what an ini file should look like
r"""
[GENERAL]
ProxyPickleFile = c:\newfilepath\myproxiefile\proxyn.pkl
ProxyConfidenceLimit = 10
UrlsAtOnce = 100
; ThreadLimit - 50% of UrlsAtOnce is a good number
ThreadLimit = 50
RequestsTimeout = 10
; ThreadTimeout - Should be a little higher than RequestsTimeout
ThreadTimeout = 12
; SleepAfterKillThread - Don't put 0.0 here
SleepAfterKillThread = 0.1
; SleepAfterStartThread - Don't put 0.0 here
SleepAfterStartThread = 0.1
IgnoreExceptions = True
SaveFolder = f:\pythonsite
DomainName = python.org
DomainLink = https://www.python.org/
"""
```
## Downloading a whole page using the command line
<img title="" src="https://github.com/hansalemaos/screenshots/raw/main/site2hdd/site2hdd00000009.png" alt="">
Raw data
{
"_id": null,
"home_page": "https://github.com/hansalemaos/site2hdd",
"name": "site2hdd",
"maintainer": "",
"docs_url": null,
"requires_python": "",
"maintainer_email": "",
"keywords": "download,scrape,site",
"author": "Johannes Fischer",
"author_email": "<aulasparticularesdealemaosp@gmail.com>",
"download_url": "https://files.pythonhosted.org/packages/5b/ec/36c3de9601f033798cda75aa3dee7dd2dc6a6846c44dd673ccf74018db9d/site2hdd-0.15.tar.gz",
"platform": null,
"description": "\n```python\n\n$pip install site2hdd\n\nfrom site2hdd import download_url_list,get_proxies,download_webpage\n\n```\n\n\n\n```python\n\nxlsxfile,pklfile = get_proxies(\n\n save_path_proxies_all_filtered='c:\\\\newfilepath\\\\myproxiefile\\\\proxy', # path doesn't have to exist, it will be created, last \n\n # part (proxy) is the name of the file - pkl and xlsx will be added\n\n # important: There will be 2 files, in this case: c:\\\\newfilepath\\\\myproxiefile\\\\proxy.pkl and c:\\\\newfilepath\\\\myproxiefile\\\\proxy.xlsx\n\n\n\n http_check_timeout=4, # if proxy can't connect within 4 seconds to wikipedia, it is invalid\n\n\n\n threads_httpcheck=50, # threads to check if the http connection is working\n\n\n\n threads_ping=100 , # before the http test, there is a ping test to check if the server exists\n\n\n\n silent=False, # show results when a working server has been found\n\n\n\n max_proxies_to_check=2000, # stops the search at 2000\n\n)\n\n```\n\n\n\n## Downloading lists of free proxy servers\n\n\n\n<img title=\"\" src=\"https://github.com/hansalemaos/screenshots/raw/main/site2hdd/site2hdd00000000.png\" alt=\"\">\n\n\n\n## Checking if the ip exists\n\n\n\n<img title=\"\" src=\"https://github.com/hansalemaos/screenshots/raw/main/site2hdd/site2hdd00000001.png\" alt=\"\">\n\n\n\n## Checking if http works and own IP is hidden\n\n\n\n<img title=\"\" src=\"https://github.com/hansalemaos/screenshots/raw/main/site2hdd/site2hdd00000004.png\" alt=\"\">\n\n\n\n```python\n\nurls = [r'''https://pandas.pydata.org/docs/#''', r'''https://pandas.pydata.org/docs/getting_started/index.html''',\n\n r'''https://pandas.pydata.org/docs/user_guide/index.html''',\n\n r'''https://pandas.pydata.org/docs/reference/index.html''',\n\n r'''https://pandas.pydata.org/docs/development/index.html''',\n\n r'''https://pandas.pydata.org/docs/whatsnew/index.html''', r'''https://pandas.pydata.org/docs/dev/index.html''',\n\n r'''https://pandas.pydata.org/docs/index.html''',\n\n 
r'''https://pandas.pydata.org/pandas-docs/version/1.4/index.html''',\n\n r'''https://pandas.pydata.org/pandas-docs/version/1.3/index.html''',\n\n r'''https://pandas.pydata.org/pandas-docs/version/1.2/index.html''',\n\n r'''https://pandas.pydata.org/pandas-docs/version/1.1/index.html''',\n\n r'''https://pandas.pydata.org/pandas-docs/version/1.0/index.html''',\n\n r'''https://github.com/pandas-dev/pandas''', r'''https://twitter.com/pandas_dev''',\n\n r'''https://pandas.pydata.org/docs/#pandas-documentation''', r'''https://pandas.pydata.org/docs/pandas.zip''',\n\n r'''https://pandas.pydata.org/''', r'''https://pypi.org/project/pandas''',\n\n r'''https://github.com/pandas-dev/pandas/issues''', r'''https://stackoverflow.com/questions/tagged/pandas''',\n\n r'''https://groups.google.com/g/pydata''', r'''https://pandas.pydata.org/docs/#module-pandas''',\n\n r'''https://www.python.org/''',\n\n r'''https://pandas.pydata.org/docs/getting_started/index.html#getting-started''',\n\n r'''https://pandas.pydata.org/docs/user_guide/index.html#user-guide''',\n\n r'''https://pandas.pydata.org/docs/reference/index.html#api''',\n\n r'''https://pandas.pydata.org/docs/development/index.html#development''',\n\n r'''https://pandas.pydata.org/docs/_sources/index.rst.txt''', r'''https://numfocus.org/''',\n\n r'''https://www.ovhcloud.com/''', r'''http://sphinx-doc.org/''', ]\n\n\n\ndownload_url_list(urls, ProxyPickleFile='c:\\\\newfilepath\\\\myproxiefile\\\\proxyn.pkl',\n\n # The file you created using the function: get_proxies \n\n SaveFolder='f:\\\\testdlpandaslinks', # where should the files be saved\n\n try_each_url_n_times=5, # maximum retries for each url\n\n ProxyConfidenceLimit=10,\n\n # each link will be downloaded twice and then compared. If only one result is positive, it counts as a not successful download. 
But if the ProxyConfidenceLimit is higher, then it will be accepted\n\n ThreadLimit=50, # downloads at the same time\n\n RequestsTimeout=10, # Timeout for requests\n\n ThreadTimeout=12, # Should be a little higher than RequestsTimeout\n\n SleepAfterKillThread=0.1, # Don't put 0.0 here - it will use too much CPU\n\n SleepAfterStartThread=0.1, # Don't put 0.0 here - it will use too much CPU\n\n IgnoreExceptions=True, )\n\n```\n\n\n\n## Downloading a url list\n\n\n\n### Never close the app when this is the last message that was printed: \"Batch done - writing files to HDD ...\"\n\n\n\n<img title=\"\" src=\"https://github.com/hansalemaos/screenshots/raw/main/site2hdd/site2hdd00000007.png\" alt=\"\">\n\n\n\n```python\n\n# downloads only links from one domain! All others are ignored!\n\nstarturls=[r'''https://pydata.org/upcoming-events/''', # if it can't find links on the starting page, pass a list of links from the site. \n\nr'''https://pydata.org/past-events/''',\n\nr'''https://pydata.org/organize-a-conference/''',\n\nr'''https://pydata.org/start-a-meetup/''',\n\nr'''https://pydata.org/volunteer/''',\n\nr'''https://pydata.org/code-of-conduct/''',\n\nr'''https://pydata.org/diversity-inclusion/''',\n\nr'''https://pydata.org/wp-content/uploads/2022/03/PyData-2022-Sponsorship-Prospectus-v4-1.pdf''',\n\nr'''https://pydata.org/sponsor-pydata/#''',\n\nr'''https://pydata.org/faqs/''',\n\nr'''https://pydata.org/''',\n\nr'''https://pydata.org/about/''',\n\nr'''https://pydata.org/sponsor-pydata/''',\n\nr'''https://pydata.org/wp-content/uploads/2022/03/PyData-2022-Sponsorship-Prospectus-v4.pdf''',]\n\ndownload_webpage(\n\n ProxyPickleFile='c:\\\\newfilepath\\\\myproxiefile\\\\proxyn.pkl',\n\n DomainName=\"pydata.org\",\n\n DomainLink=\"https://pydata.org/\",\n\n SaveFolder=r\"f:\\pandashomepagetest\",\n\n ProxyConfidenceLimit=10,\n\n UrlsAtOnce=100,\n\n ThreadLimit=50,\n\n RequestsTimeout=10,\n\n ThreadTimeout=12,\n\n SleepAfterKillThread=0.1,\n\n SleepAfterStartThread=0.1,\n\n 
IgnoreExceptions=True,\n\n proxy_http_check_timeout=4,\n\n proxy_threads_httpcheck=65,\n\n proxy_threads_ping=100,\n\n proxy_silent=False,\n\n proxy_max_proxies_to_check=1000,\n\n starturls=starturls,\n\n )\n\n```\n\n\n\n## Downloading a whole page\n\n\n\n<img title=\"\" src=\"https://github.com/hansalemaos/screenshots/raw/main/site2hdd/site2hdd00000007.png\" alt=\"\">\n\n\n\n```python\n\n# Command line also works, but you can't use starturls, and the proxy.pkl has to exist already! \n\n# Best usage is to continue a download that hasn't been finished yet. \n\n# Existing files won't be downloaded again! \n\n\n\nimport subprocess\n\nimport sys\n\n\n\nsubprocess.run(\n\n [\n\n sys.executable,\n\n r\"C:\\Users\\USERNAME\\anaconda3\\envs\\ENVNAME\\Lib\\site-packages\\site2hdd\\__init__.py\",\n\n r\"C:\\Users\\USERNAME\\anaconda3\\envs\\ENVNAME\\pandaspyd.ini\",\n\n ]\n\n)\n\n\n\n# This is how ini files should look like \n\n\n\nr\"\"\"\n\n[GENERAL]\n\nProxyPickleFile = c:\\newfilepath\\myproxiefile\\proxyn.pkl\n\nProxyConfidenceLimit = 10\n\nUrlsAtOnce = 100\n\n; ThreadLimit - 50% of UrlsAtOnce is a good number \n\nThreadLimit = 50 \n\nRequestsTimeout = 10 \n\n; ThreadTimeout - Should be a little higher than RequestsTimeout\n\nThreadTimeout = 12 \n\n; SleepAfterKillThread - Don't put 0.0 here\n\nSleepAfterKillThread = 0.1 \n\n; SleepAfterStartThread - Don't put 0.0 here\n\nSleepAfterStartThread = 0.1 \n\nIgnoreExceptions = True\n\nSaveFolder = f:\\pythonsite\n\nDomainName = python.org\n\nDomainLink = https://www.python.org/\n\n\"\"\"\n\n```\n\n\n\n## Downloading a whole page using the command line\n\n\n\n<img title=\"\" src=\"https://github.com/hansalemaos/screenshots/raw/main/site2hdd/site2hdd00000009.png\" alt=\"\">\n\n",
"bugtrack_url": null,
"license": "MIT",
"summary": "Download sites with public proxies - threading",
"version": "0.15",
"split_keywords": [
"download",
"scrape",
"site"
],
"urls": [
{
"comment_text": "",
"digests": {
"blake2b_256": "aeacf49389badd7aad059a766c4271fbac993c46ce416c97e0436b3e8f9df479",
"md5": "3cedeb87ccda626665c833438681aa00",
"sha256": "5d0e93aff04e9d37214fcdddebdd27f0ddc87e1699f3bb61c7738b2acc444e4e"
},
"downloads": -1,
"filename": "site2hdd-0.15-py3-none-any.whl",
"has_sig": false,
"md5_digest": "3cedeb87ccda626665c833438681aa00",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": null,
"size": 13300,
"upload_time": "2023-01-23T02:51:36",
"upload_time_iso_8601": "2023-01-23T02:51:36.636944Z",
"url": "https://files.pythonhosted.org/packages/ae/ac/f49389badd7aad059a766c4271fbac993c46ce416c97e0436b3e8f9df479/site2hdd-0.15-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
},
{
"comment_text": "",
"digests": {
"blake2b_256": "5bec36c3de9601f033798cda75aa3dee7dd2dc6a6846c44dd673ccf74018db9d",
"md5": "f28e1fd630b8e7813ce003b19638d88d",
"sha256": "6e9eef1d5d165d875c903ecae92a620efaac1aa2446885192efc7fa79449ce0f"
},
"downloads": -1,
"filename": "site2hdd-0.15.tar.gz",
"has_sig": false,
"md5_digest": "f28e1fd630b8e7813ce003b19638d88d",
"packagetype": "sdist",
"python_version": "source",
"requires_python": null,
"size": 12792,
"upload_time": "2023-01-23T02:51:38",
"upload_time_iso_8601": "2023-01-23T02:51:38.323086Z",
"url": "https://files.pythonhosted.org/packages/5b/ec/36c3de9601f033798cda75aa3dee7dd2dc6a6846c44dd673ccf74018db9d/site2hdd-0.15.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2023-01-23 02:51:38",
"github": true,
"gitlab": false,
"bitbucket": false,
"github_user": "hansalemaos",
"github_project": "site2hdd",
"travis_ci": false,
"coveralls": false,
"github_actions": false,
"requirements": [],
"lcname": "site2hdd"
}