site2hdd


Name: site2hdd (JSON)
Version: 0.15 (PyPI version JSON)
Downloads: (see raw data below)
Home page: https://github.com/hansalemaos/site2hdd
Summary: Download sites with public proxies - threading
Upload time: 2023-01-23 02:51:38
Maintainer: (not recorded)
Docs URL: None
Author: Johannes Fischer
Requires Python: (not specified)
License: MIT
Keywords: download, scrape, site
VCS: (not recorded)
Bugtrack URL: (not recorded)
Requirements: No requirements were recorded.
Travis-CI: No Travis.
Coveralls test coverage: No coveralls.
            
```python

$ pip install site2hdd

from site2hdd import download_url_list,get_proxies,download_webpage

```



```python

xlsxfile,pklfile = get_proxies(

  save_path_proxies_all_filtered='c:\\newfilepath\\myproxiefile\\proxy', #  path doesn't have to exist, it will be created, last 

 # part (proxy) is the name of the file - pkl and xlsx will be added

 # important: There will be 2 files, in this case: c:\\newfilepath\\myproxiefile\\proxy.pkl and c:\\newfilepath\\myproxiefile\\proxy.xlsx



  http_check_timeout=4, # if proxy can't connect within 4 seconds to wikipedia, it is invalid



  threads_httpcheck=50, # threads to check if the http connection is working



  threads_ping=100 ,  # before the http test, there is a ping test to check if the server exists



  silent=False, # show results when a working server has been found



  max_proxies_to_check=2000, # stops the search at 2000

)

```



## Downloading lists of free proxy servers



<img title="" src="https://github.com/hansalemaos/screenshots/raw/main/site2hdd/site2hdd00000000.png" alt="">



## Checking if the ip exists



<img title="" src="https://github.com/hansalemaos/screenshots/raw/main/site2hdd/site2hdd00000001.png" alt="">



## Checking if http works and own IP is hidden



<img title="" src="https://github.com/hansalemaos/screenshots/raw/main/site2hdd/site2hdd00000004.png" alt="">



```python

urls = [r'''https://pandas.pydata.org/docs/#''', r'''https://pandas.pydata.org/docs/getting_started/index.html''',

       r'''https://pandas.pydata.org/docs/user_guide/index.html''',

       r'''https://pandas.pydata.org/docs/reference/index.html''',

       r'''https://pandas.pydata.org/docs/development/index.html''',

       r'''https://pandas.pydata.org/docs/whatsnew/index.html''', r'''https://pandas.pydata.org/docs/dev/index.html''',

       r'''https://pandas.pydata.org/docs/index.html''',

       r'''https://pandas.pydata.org/pandas-docs/version/1.4/index.html''',

       r'''https://pandas.pydata.org/pandas-docs/version/1.3/index.html''',

       r'''https://pandas.pydata.org/pandas-docs/version/1.2/index.html''',

       r'''https://pandas.pydata.org/pandas-docs/version/1.1/index.html''',

       r'''https://pandas.pydata.org/pandas-docs/version/1.0/index.html''',

       r'''https://github.com/pandas-dev/pandas''', r'''https://twitter.com/pandas_dev''',

       r'''https://pandas.pydata.org/docs/#pandas-documentation''', r'''https://pandas.pydata.org/docs/pandas.zip''',

       r'''https://pandas.pydata.org/''', r'''https://pypi.org/project/pandas''',

       r'''https://github.com/pandas-dev/pandas/issues''', r'''https://stackoverflow.com/questions/tagged/pandas''',

       r'''https://groups.google.com/g/pydata''', r'''https://pandas.pydata.org/docs/#module-pandas''',

       r'''https://www.python.org/''',

       r'''https://pandas.pydata.org/docs/getting_started/index.html#getting-started''',

       r'''https://pandas.pydata.org/docs/user_guide/index.html#user-guide''',

       r'''https://pandas.pydata.org/docs/reference/index.html#api''',

       r'''https://pandas.pydata.org/docs/development/index.html#development''',

       r'''https://pandas.pydata.org/docs/_sources/index.rst.txt''', r'''https://numfocus.org/''',

       r'''https://www.ovhcloud.com/''', r'''http://sphinx-doc.org/''', ]



download_url_list(urls, ProxyPickleFile='c:\\newfilepath\\myproxiefile\\proxyn.pkl',

   # The file you created using the function: get_proxies 

   SaveFolder='f:\\testdlpandaslinks',  # where should the files be saved

   try_each_url_n_times=5,  # maximum retries for each url

   ProxyConfidenceLimit=10,

   # each link will be downloaded twice and then compared. If only one result is positive, it counts as an unsuccessful download. But if the ProxyConfidenceLimit is higher, it will still be accepted

   ThreadLimit=50,  # downloads at the same time

   RequestsTimeout=10,  # Timeout for requests

   ThreadTimeout=12,  # Should be a little higher than RequestsTimeout

   SleepAfterKillThread=0.1,  # Don't put 0.0 here - it will use too much CPU

   SleepAfterStartThread=0.1,  # Don't put 0.0 here - it will use too much CPU

   IgnoreExceptions=True, )

```



## Downloading a url list



### Never close the app when this is the last message that was printed: "Batch done - writing files to HDD ..."



<img title="" src="https://github.com/hansalemaos/screenshots/raw/main/site2hdd/site2hdd00000007.png" alt="">



```python

# downloads only links from one domain! All others are ignored!

starturls=[r'''https://pydata.org/upcoming-events/''',  # if it can't find links on the starting page, pass a list of links from the site. 

r'''https://pydata.org/past-events/''',

r'''https://pydata.org/organize-a-conference/''',

r'''https://pydata.org/start-a-meetup/''',

r'''https://pydata.org/volunteer/''',

r'''https://pydata.org/code-of-conduct/''',

r'''https://pydata.org/diversity-inclusion/''',

r'''https://pydata.org/wp-content/uploads/2022/03/PyData-2022-Sponsorship-Prospectus-v4-1.pdf''',

r'''https://pydata.org/sponsor-pydata/#''',

r'''https://pydata.org/faqs/''',

r'''https://pydata.org/''',

r'''https://pydata.org/about/''',

r'''https://pydata.org/sponsor-pydata/''',

r'''https://pydata.org/wp-content/uploads/2022/03/PyData-2022-Sponsorship-Prospectus-v4.pdf''',]

download_webpage(

      ProxyPickleFile='c:\\newfilepath\\myproxiefile\\proxyn.pkl',

      DomainName="pydata.org",

      DomainLink="https://pydata.org/",

      SaveFolder=r"f:\pandashomepagetest",

      ProxyConfidenceLimit=10,

      UrlsAtOnce=100,

      ThreadLimit=50,

      RequestsTimeout=10,

      ThreadTimeout=12,

      SleepAfterKillThread=0.1,

      SleepAfterStartThread=0.1,

      IgnoreExceptions=True,

      proxy_http_check_timeout=4,

      proxy_threads_httpcheck=65,

      proxy_threads_ping=100,

      proxy_silent=False,

      proxy_max_proxies_to_check=1000,

      starturls=starturls,

  )

```



## Downloading a whole page



<img title="" src="https://github.com/hansalemaos/screenshots/raw/main/site2hdd/site2hdd00000007.png" alt="">



```python

# Command line also works, but you can't use starturls, and the proxy.pkl has to exist already! 

# Best usage is to continue a download that hasn't been finished yet. 

# Existing files won't be downloaded again! 



import subprocess

import sys



subprocess.run(

    [

        sys.executable,

        r"C:\Users\USERNAME\anaconda3\envs\ENVNAME\Lib\site-packages\site2hdd\__init__.py",

        r"C:\Users\USERNAME\anaconda3\envs\ENVNAME\pandaspyd.ini",

    ]

)



# This is what ini files should look like



r"""

[GENERAL]

ProxyPickleFile = c:\newfilepath\myproxiefile\proxyn.pkl

ProxyConfidenceLimit = 10

UrlsAtOnce = 100

; ThreadLimit - 50% of UrlsAtOnce is a good number 

ThreadLimit = 50  

RequestsTimeout = 10 

; ThreadTimeout - Should be a little higher than RequestsTimeout

ThreadTimeout = 12 

; SleepAfterKillThread - Don't put 0.0 here

SleepAfterKillThread = 0.1 

; SleepAfterStartThread - Don't put 0.0 here

SleepAfterStartThread = 0.1 

IgnoreExceptions = True

SaveFolder = f:\pythonsite

DomainName = python.org

DomainLink = https://www.python.org/

"""

```



## Downloading a whole page using the command line



<img title="" src="https://github.com/hansalemaos/screenshots/raw/main/site2hdd/site2hdd00000009.png" alt="">


            

Raw data

            {
    "_id": null,
    "home_page": "https://github.com/hansalemaos/site2hdd",
    "name": "site2hdd",
    "maintainer": "",
    "docs_url": null,
    "requires_python": "",
    "maintainer_email": "",
    "keywords": "download,scrape,site",
    "author": "Johannes Fischer",
    "author_email": "<aulasparticularesdealemaosp@gmail.com>",
    "download_url": "https://files.pythonhosted.org/packages/5b/ec/36c3de9601f033798cda75aa3dee7dd2dc6a6846c44dd673ccf74018db9d/site2hdd-0.15.tar.gz",
    "platform": null,
    "description": "\n```python\n\n$pip install site2hdd\n\nfrom site2hdd import download_url_list,get_proxies,download_webpage\n\n```\n\n\n\n```python\n\nxlsxfile,pklfile = get_proxies(\n\n  save_path_proxies_all_filtered='c:\\\\newfilepath\\\\myproxiefile\\\\proxy', #  path doesn't have to exist, it will be created, last \n\n # part (proxy) is the name of the file - pkl and xlsx will be added\n\n # important: There will be 2 files, in this case: c:\\\\newfilepath\\\\myproxiefile\\\\proxy.pkl and c:\\\\newfilepath\\\\myproxiefile\\\\proxy.xlsx\n\n\n\n  http_check_timeout=4, # if proxy can't connect within 4 seconds to wikipedia, it is invalid\n\n\n\n  threads_httpcheck=50, # threads to check if the http connection is working\n\n\n\n  threads_ping=100 ,  # before the http test, there is a ping test to check if the server exists\n\n\n\n  silent=False, # show results when a working server has been found\n\n\n\n  max_proxies_to_check=2000, # stops the search at 2000\n\n)\n\n```\n\n\n\n## Downloading lists of free proxy servers\n\n\n\n<img title=\"\" src=\"https://github.com/hansalemaos/screenshots/raw/main/site2hdd/site2hdd00000000.png\" alt=\"\">\n\n\n\n## Checking if the ip exists\n\n\n\n<img title=\"\" src=\"https://github.com/hansalemaos/screenshots/raw/main/site2hdd/site2hdd00000001.png\" alt=\"\">\n\n\n\n## Checking if http works and own IP is hidden\n\n\n\n<img title=\"\" src=\"https://github.com/hansalemaos/screenshots/raw/main/site2hdd/site2hdd00000004.png\" alt=\"\">\n\n\n\n```python\n\nurls = [r'''https://pandas.pydata.org/docs/#''', r'''https://pandas.pydata.org/docs/getting_started/index.html''',\n\n       r'''https://pandas.pydata.org/docs/user_guide/index.html''',\n\n       r'''https://pandas.pydata.org/docs/reference/index.html''',\n\n       r'''https://pandas.pydata.org/docs/development/index.html''',\n\n       r'''https://pandas.pydata.org/docs/whatsnew/index.html''', r'''https://pandas.pydata.org/docs/dev/index.html''',\n\n       
r'''https://pandas.pydata.org/docs/index.html''',\n\n       r'''https://pandas.pydata.org/pandas-docs/version/1.4/index.html''',\n\n       r'''https://pandas.pydata.org/pandas-docs/version/1.3/index.html''',\n\n       r'''https://pandas.pydata.org/pandas-docs/version/1.2/index.html''',\n\n       r'''https://pandas.pydata.org/pandas-docs/version/1.1/index.html''',\n\n       r'''https://pandas.pydata.org/pandas-docs/version/1.0/index.html''',\n\n       r'''https://github.com/pandas-dev/pandas''', r'''https://twitter.com/pandas_dev''',\n\n       r'''https://pandas.pydata.org/docs/#pandas-documentation''', r'''https://pandas.pydata.org/docs/pandas.zip''',\n\n       r'''https://pandas.pydata.org/''', r'''https://pypi.org/project/pandas''',\n\n       r'''https://github.com/pandas-dev/pandas/issues''', r'''https://stackoverflow.com/questions/tagged/pandas''',\n\n       r'''https://groups.google.com/g/pydata''', r'''https://pandas.pydata.org/docs/#module-pandas''',\n\n       r'''https://www.python.org/''',\n\n       r'''https://pandas.pydata.org/docs/getting_started/index.html#getting-started''',\n\n       r'''https://pandas.pydata.org/docs/user_guide/index.html#user-guide''',\n\n       r'''https://pandas.pydata.org/docs/reference/index.html#api''',\n\n       r'''https://pandas.pydata.org/docs/development/index.html#development''',\n\n       r'''https://pandas.pydata.org/docs/_sources/index.rst.txt''', r'''https://numfocus.org/''',\n\n       r'''https://www.ovhcloud.com/''', r'''http://sphinx-doc.org/''', ]\n\n\n\ndownload_url_list(urls, ProxyPickleFile='c:\\\\newfilepath\\\\myproxiefile\\\\proxyn.pkl',\n\n   # The file you created using the function: get_proxies \n\n   SaveFolder='f:\\\\testdlpandaslinks',  # where should the files be saved\n\n   try_each_url_n_times=5,  # maximum retries for each url\n\n   ProxyConfidenceLimit=10,\n\n   # each link will be downloaded twice and then compared. If only one result is positive, it counts as a not successful download. 
But if     the ProxyConfidenceLimit is higher, then it will be accepted\n\n   ThreadLimit=50,  # downloads at the same time\n\n   RequestsTimeout=10,  # Timeout for requests\n\n   ThreadTimeout=12,  # Should be a little higher than RequestsTimeout\n\n   SleepAfterKillThread=0.1,  # Don't put 0.0 here - it will use too much CPU\n\n   SleepAfterStartThread=0.1,  # Don't put 0.0 here - it will use too much CPU\n\n   IgnoreExceptions=True, )\n\n```\n\n\n\n## Downloading a url list\n\n\n\n### Never close the app when this is the last message that was printed: \"Batch done - writing files to HDD ...\"\n\n\n\n<img title=\"\" src=\"https://github.com/hansalemaos/screenshots/raw/main/site2hdd/site2hdd00000007.png\" alt=\"\">\n\n\n\n```python\n\n# downloads only links from one domain! All others are ignored!\n\nstarturls=[r'''https://pydata.org/upcoming-events/''',  # if it can't find links on the starting page, pass a list of links from the site. \n\nr'''https://pydata.org/past-events/''',\n\nr'''https://pydata.org/organize-a-conference/''',\n\nr'''https://pydata.org/start-a-meetup/''',\n\nr'''https://pydata.org/volunteer/''',\n\nr'''https://pydata.org/code-of-conduct/''',\n\nr'''https://pydata.org/diversity-inclusion/''',\n\nr'''https://pydata.org/wp-content/uploads/2022/03/PyData-2022-Sponsorship-Prospectus-v4-1.pdf''',\n\nr'''https://pydata.org/sponsor-pydata/#''',\n\nr'''https://pydata.org/faqs/''',\n\nr'''https://pydata.org/''',\n\nr'''https://pydata.org/about/''',\n\nr'''https://pydata.org/sponsor-pydata/''',\n\nr'''https://pydata.org/wp-content/uploads/2022/03/PyData-2022-Sponsorship-Prospectus-v4.pdf''',]\n\ndownload_webpage(\n\n      ProxyPickleFile='c:\\\\newfilepath\\\\myproxiefile\\\\proxyn.pkl',\n\n      DomainName=\"pydata.org\",\n\n      DomainLink=\"https://pydata.org/\",\n\n      SaveFolder=r\"f:\\pandashomepagetest\",\n\n      ProxyConfidenceLimit=10,\n\n      UrlsAtOnce=100,\n\n      ThreadLimit=50,\n\n      RequestsTimeout=10,\n\n      
ThreadTimeout=12,\n\n      SleepAfterKillThread=0.1,\n\n      SleepAfterStartThread=0.1,\n\n      IgnoreExceptions=True,\n\n      proxy_http_check_timeout=4,\n\n      proxy_threads_httpcheck=65,\n\n      proxy_threads_ping=100,\n\n      proxy_silent=False,\n\n      proxy_max_proxies_to_check=1000,\n\n      starturls=starturls,\n\n  )\n\n```\n\n\n\n## Downloading a whole page\n\n\n\n<img title=\"\" src=\"https://github.com/hansalemaos/screenshots/raw/main/site2hdd/site2hdd00000007.png\" alt=\"\">\n\n\n\n```python\n\n# Command line also works, but you can't use starturls, and the proxy.pkl has to exist already! \n\n# Best usage is to continue a download that hasn't been finished yet. \n\n# Existing files won't be downloaded again! \n\n\n\nimport subprocess\n\nimport sys\n\n\n\nsubprocess.run(\n\n    [\n\n        sys.executable,\n\n        r\"C:\\Users\\USERNAME\\anaconda3\\envs\\ENVNAME\\Lib\\site-packages\\site2hdd\\__init__.py\",\n\n        r\"C:\\Users\\USERNAME\\anaconda3\\envs\\ENVNAME\\pandaspyd.ini\",\n\n    ]\n\n)\n\n\n\n# This is how ini files should look like \n\n\n\nr\"\"\"\n\n[GENERAL]\n\nProxyPickleFile = c:\\newfilepath\\myproxiefile\\proxyn.pkl\n\nProxyConfidenceLimit = 10\n\nUrlsAtOnce = 100\n\n; ThreadLimit - 50% of UrlsAtOnce is a good number \n\nThreadLimit = 50  \n\nRequestsTimeout = 10 \n\n; ThreadTimeout - Should be a little higher than RequestsTimeout\n\nThreadTimeout = 12 \n\n; SleepAfterKillThread - Don't put 0.0 here\n\nSleepAfterKillThread = 0.1 \n\n; SleepAfterStartThread - Don't put 0.0 here\n\nSleepAfterStartThread = 0.1 \n\nIgnoreExceptions = True\n\nSaveFolder = f:\\pythonsite\n\nDomainName = python.org\n\nDomainLink = https://www.python.org/\n\n\"\"\"\n\n```\n\n\n\n## Downloading a whole page using the command line\n\n\n\n<img title=\"\" src=\"https://github.com/hansalemaos/screenshots/raw/main/site2hdd/site2hdd00000009.png\" alt=\"\">\n\n",
    "bugtrack_url": null,
    "license": "MIT",
    "summary": "Download sites with public proxies - threading",
    "version": "0.15",
    "split_keywords": [
        "download",
        "scrape",
        "site"
    ],
    "urls": [
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "aeacf49389badd7aad059a766c4271fbac993c46ce416c97e0436b3e8f9df479",
                "md5": "3cedeb87ccda626665c833438681aa00",
                "sha256": "5d0e93aff04e9d37214fcdddebdd27f0ddc87e1699f3bb61c7738b2acc444e4e"
            },
            "downloads": -1,
            "filename": "site2hdd-0.15-py3-none-any.whl",
            "has_sig": false,
            "md5_digest": "3cedeb87ccda626665c833438681aa00",
            "packagetype": "bdist_wheel",
            "python_version": "py3",
            "requires_python": null,
            "size": 13300,
            "upload_time": "2023-01-23T02:51:36",
            "upload_time_iso_8601": "2023-01-23T02:51:36.636944Z",
            "url": "https://files.pythonhosted.org/packages/ae/ac/f49389badd7aad059a766c4271fbac993c46ce416c97e0436b3e8f9df479/site2hdd-0.15-py3-none-any.whl",
            "yanked": false,
            "yanked_reason": null
        },
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "5bec36c3de9601f033798cda75aa3dee7dd2dc6a6846c44dd673ccf74018db9d",
                "md5": "f28e1fd630b8e7813ce003b19638d88d",
                "sha256": "6e9eef1d5d165d875c903ecae92a620efaac1aa2446885192efc7fa79449ce0f"
            },
            "downloads": -1,
            "filename": "site2hdd-0.15.tar.gz",
            "has_sig": false,
            "md5_digest": "f28e1fd630b8e7813ce003b19638d88d",
            "packagetype": "sdist",
            "python_version": "source",
            "requires_python": null,
            "size": 12792,
            "upload_time": "2023-01-23T02:51:38",
            "upload_time_iso_8601": "2023-01-23T02:51:38.323086Z",
            "url": "https://files.pythonhosted.org/packages/5b/ec/36c3de9601f033798cda75aa3dee7dd2dc6a6846c44dd673ccf74018db9d/site2hdd-0.15.tar.gz",
            "yanked": false,
            "yanked_reason": null
        }
    ],
    "upload_time": "2023-01-23 02:51:38",
    "github": true,
    "gitlab": false,
    "bitbucket": false,
    "github_user": "hansalemaos",
    "github_project": "site2hdd",
    "travis_ci": false,
    "coveralls": false,
    "github_actions": false,
    "requirements": [],
    "lcname": "site2hdd"
}
        
Elapsed time: 0.03336s