robobrowser


Name: robobrowser JSON
Version 0.5.3 PyPI version JSON
download
home_page: https://github.com/jmcarp/robobrowser
Summary: Your friendly neighborhood web scraper
upload_time: 2015-06-07 19:47:05
maintainer: None
docs_url: None
author: Joshua Carp
requires_python: None
license: MIT
keywords robobrowser
VCS
bugtrack_url
requirements No requirements were recorded.
Travis-CI
coveralls test coverage
RoboBrowser: Your friendly neighborhood web scraper
===================================================

.. image:: https://badge.fury.io/py/robobrowser.png
    :target: http://badge.fury.io/py/robobrowser
    :alt: PyPI version

.. image:: https://travis-ci.org/jmcarp/robobrowser.png?branch=master
    :target: https://travis-ci.org/jmcarp/robobrowser
    :alt: Travis CI build status

.. image:: https://coveralls.io/repos/jmcarp/robobrowser/badge.png?branch=master
    :target: https://coveralls.io/r/jmcarp/robobrowser
    :alt: Coveralls test coverage

Homepage: `http://robobrowser.readthedocs.org/ <http://robobrowser.readthedocs.org/>`_

RoboBrowser is a simple, Pythonic library for browsing the web without a standalone web browser. RoboBrowser
can fetch a page, click on links and buttons, and fill out and submit forms. If you need to interact with web services
that don't have APIs, RoboBrowser can help.

.. code-block:: python

    import re
    from robobrowser import RoboBrowser

    # Browse to Genius
    browser = RoboBrowser(history=True)
    browser.open('http://genius.com/')

    # Search for Porcupine Tree
    form = browser.get_form(action='/search')
    form                # <RoboForm q=>
    form['q'].value = 'porcupine tree'
    browser.submit_form(form)

    # Look up the first song
    songs = browser.select('.song_link')
    browser.follow_link(songs[0])
    lyrics = browser.select('.lyrics')
    lyrics[0].text      # \nHear the sound of music ...

    # Back to results page
    browser.back()

    # Look up my favorite song
    song_link = browser.get_link('trains')
    browser.follow_link(song_link)

    # Can also search HTML using regex patterns
    lyrics = browser.find(class_=re.compile(r'\blyrics\b'))
    lyrics.text         # \nTrain set and match spied under the blind...

RoboBrowser combines the best of two excellent Python libraries:
`Requests <http://docs.python-requests.org/en/latest/>`_ and
`BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/>`_.
RoboBrowser represents browser sessions using Requests and HTML responses
using BeautifulSoup, transparently exposing methods of both libraries:

.. code-block:: python

    import re
    from robobrowser import RoboBrowser

    browser = RoboBrowser(user_agent='a python robot')
    browser.open('https://github.com/')

    # Inspect the browser session
    browser.session.cookies['_gh_sess']         # BAh7Bzo...
    browser.session.headers['User-Agent']       # a python robot

    # Search the parsed HTML
    browser.select('div.teaser-icon')       # [<div class="teaser-icon">
                                            # <span class="mega-octicon octicon-checklist"></span>
                                            # </div>,
                                            # ...
    browser.find(class_=re.compile(r'column', re.I))    # <div class="one-third column">
                                                        # <div class="teaser-icon">
                                                        # <span class="mega-octicon octicon-checklist"></span>
                                                        # ...

You can also pass a custom ``Session`` instance for lower-level configuration:

.. code-block:: python

    from requests import Session
    from robobrowser import RoboBrowser

    session = Session()
    session.verify = False  # Skip SSL verification
    session.proxies = {'http': 'http://custom.proxy.com/'}  # Set default proxies
    browser = RoboBrowser(session=session)

RoboBrowser also includes tools for working with forms, inspired by
`WebTest <https://github.com/Pylons/webtest>`_ and `Mechanize <http://wwwsearch.sourceforge.net/mechanize/>`_.

.. code-block:: python

    from robobrowser import RoboBrowser

    browser = RoboBrowser()
    browser.open('http://twitter.com')

    # Get the signup form
    signup_form = browser.get_form(class_='signup')
    signup_form         # <RoboForm user[name]=, user[email]=, ...

    # Inspect its values
    signup_form['authenticity_token'].value     # 6d03597 ...

    # Fill it out
    signup_form['user[name]'].value = 'python-robot'
    signup_form['user[user_password]'].value = 'secret'

    # Submit the form
    browser.submit_form(signup_form)

Checkboxes:

.. code-block:: python

    from robobrowser import RoboBrowser

    # Browse to a page with checkbox inputs
    browser = RoboBrowser()
    browser.open('http://www.w3schools.com/html/html_forms.asp')

    # Find the form
    form = browser.get_forms()[3]
    form                            # <RoboForm vehicle=[]>
    form['vehicle']                 # <robobrowser.forms.fields.Checkbox...>

    # Checked values can be get and set like lists
    form['vehicle'].options         # [u'Bike', u'Car']
    form['vehicle'].value           # []
    form['vehicle'].value = ['Bike']
    form['vehicle'].value = ['Bike', 'Car']

    # Values can also be set using input labels
    form['vehicle'].labels          # [u'I have a bike', u'I have a car \r\n']
    form['vehicle'].value = ['I have a bike']
    form['vehicle'].value           # [u'Bike']

    # Only values that correspond to checkbox values or labels can be set;
    # this will raise a `ValueError`
    form['vehicle'].value = ['Hot Dogs']

Uploading files:

.. code-block:: python

    from robobrowser import RoboBrowser

    # Browse to a page with an upload form
    browser = RoboBrowser()
    browser.open('http://cgi-lib.berkeley.edu/ex/fup.html')

    # Find the form
    upload_form = browser.get_form()
    upload_form                     # <RoboForm upfile=, note=>

    # Choose a file to upload
    upload_form['upfile']           # <robobrowser.forms.fields.FileInput...>
    upload_form['upfile'].value = open('path/to/file.txt', 'r')

    # Submit
    browser.submit_form(upload_form)

By default, creating a browser instantiates a new requests ``Session``.

Requirements
------------

- Python >= 2.6 or >= 3.3

License
-------

MIT licensed. See the bundled `LICENSE <https://github.com/jmcarp/robobrowser/blob/master/LICENSE>`_ file for more details.


            

Raw data

            {
    "_id": null,
    "home_page": "https://github.com/jmcarp/robobrowser",
    "name": "robobrowser",
    "maintainer": null,
    "docs_url": null,
    "requires_python": null,
    "maintainer_email": null,
    "keywords": "robobrowser",
    "author": "Joshua Carp",
    "author_email": "jm.carp@gmail.com",
    "download_url": "https://files.pythonhosted.org/packages/f3/53/fd527e78fe8bdf7ec24b3b821c88c2dcbbe63d120c838f80f12ed0a0fac6/robobrowser-0.5.3.tar.gz",
    "platform": "UNKNOWN",
    "description": "RoboBrowser: Your friendly neighborhood web scraper\n===============================================\n\n.. image:: https://badge.fury.io/py/robobrowser.png\n    :target: http://badge.fury.io/py/robobrowser\n\n.. image:: https://travis-ci.org/jmcarp/robobrowser.png?branch=master\n        :target: https://travis-ci.org/jmcarp/robobrowser\n\n.. image:: https://coveralls.io/repos/jmcarp/robobrowser/badge.png?branch=master\n        :target: https://coveralls.io/r/jmcarp/robobrowser\n\nHomepage: `http://robobrowser.readthedocs.org/ <http://robobrowser.readthedocs.org/>`_\n\nRoboBrowser is a simple, Pythonic library for browsing the web without a standalone web browser. RoboBrowser\ncan fetch a page, click on links and buttons, and fill out and submit forms. If you need to interact with web services\nthat don't have APIs, RoboBrowser can help.\n\n.. code-block:: python\n\n    import re\n    from robobrowser import RoboBrowser\n\n    # Browse to Genius\n    browser = RoboBrowser(history=True)\n    browser.open('http://genius.com/')\n\n    # Search for Porcupine Tree\n    form = browser.get_form(action='/search')\n    form                # <RoboForm q=>\n    form['q'].value = 'porcupine tree'\n    browser.submit_form(form)\n\n    # Look up the first song\n    songs = browser.select('.song_link')\n    browser.follow_link(songs[0])\n    lyrics = browser.select('.lyrics')\n    lyrics[0].text      # \\nHear the sound of music ...\n\n    # Back to results page\n    browser.back()\n\n    # Look up my favorite song\n    song_link = browser.get_link('trains')\n    browser.follow_link(song_link)\n\n    # Can also search HTML using regex patterns\n    lyrics = browser.find(class_=re.compile(r'\\blyrics\\b'))\n    lyrics.text         # \\nTrain set and match spied under the blind...\n\nRoboBrowser combines the best of two excellent Python libraries:\n`Requests <http://docs.python-requests.org/en/latest/>`_ and\n`BeautifulSoup 
<http://www.crummy.com/software/BeautifulSoup/>`_.\nRoboBrowser represents browser sessions using Requests and HTML responses\nusing BeautifulSoup, transparently exposing methods of both libraries:\n\n.. code-block:: python\n\n    import re\n    from robobrowser import RoboBrowser\n\n    browser = RoboBrowser(user_agent='a python robot')\n    browser.open('https://github.com/')\n\n    # Inspect the browser session\n    browser.session.cookies['_gh_sess']         # BAh7Bzo...\n    browser.session.headers['User-Agent']       # a python robot\n\n    # Search the parsed HTML\n    browser.select('div.teaser-icon')       # [<div class=\"teaser-icon\">\n                                            # <span class=\"mega-octicon octicon-checklist\"></span>\n                                            # </div>,\n                                            # ...\n    browser.find(class_=re.compile(r'column', re.I))    # <div class=\"one-third column\">\n                                                        # <div class=\"teaser-icon\">\n                                                        # <span class=\"mega-octicon octicon-checklist\"></span>\n                                                        # ...\n\nYou can also pass a custom `Session` instance for lower-level configuration:\n\n.. code-block:: python\n\n    from requests import Session\n    from robobrowser import RoboBrowser\n\n    session = Session()\n    session.verify = False  # Skip SSL verification\n    session.proxies = {'http': 'http://custom.proxy.com/'}  # Set default proxies\n    browser = RoboBrowser(session=session)\n\nRoboBrowser also includes tools for working with forms, inspired by\n`WebTest <https://github.com/Pylons/webtest>`_ and `Mechanize <http://wwwsearch.sourceforge.net/mechanize/>`_.\n\n.. 
code-block:: python\n\n    from robobrowser import RoboBrowser\n\n    browser = RoboBrowser()\n    browser.open('http://twitter.com')\n\n    # Get the signup form\n    signup_form = browser.get_form(class_='signup')\n    signup_form         # <RoboForm user[name]=, user[email]=, ...\n\n    # Inspect its values\n    signup_form['authenticity_token'].value     # 6d03597 ...\n\n    # Fill it out\n    signup_form['user[name]'].value = 'python-robot'\n    signup_form['user[user_password]'].value = 'secret'\n\n    # Submit the form\n    browser.submit_form(signup_form)\n\nCheckboxes:\n\n.. code-block:: python\n\n    from robobrowser import RoboBrowser\n\n    # Browse to a page with checkbox inputs\n    browser = RoboBrowser()\n    browser.open('http://www.w3schools.com/html/html_forms.asp')\n\n    # Find the form\n    form = browser.get_forms()[3]\n    form                            # <RoboForm vehicle=[]>\n    form['vehicle']                 # <robobrowser.forms.fields.Checkbox...>\n\n    # Checked values can be get and set like lists\n    form['vehicle'].options         # [u'Bike', u'Car']\n    form['vehicle'].value           # []\n    form['vehicle'].value = ['Bike']\n    form['vehicle'].value = ['Bike', 'Car']\n\n    # Values can also be set using input labels\n    form['vehicle'].labels          # [u'I have a bike', u'I have a car \\r\\n']\n    form['vehicle'].value = ['I have a bike']\n    form['vehicle'].value           # [u'Bike']\n\n    # Only values that correspond to checkbox values or labels can be set;\n    # this will raise a `ValueError`\n    form['vehicle'].value = ['Hot Dogs']\n\nUploading files:\n\n.. 
code-block:: python\n\n    from robobrowser import RoboBrowser\n\n    # Browse to a page with an upload form\n    browser = RoboBrowser()\n    browser.open('http://cgi-lib.berkeley.edu/ex/fup.html')\n\n    # Find the form\n    upload_form = browser.get_form()\n    upload_form                     # <RoboForm upfile=, note=>\n\n    # Choose a file to upload\n    upload_form['upfile']           # <robobrowser.forms.fields.FileInput...>\n    upload_form['upfile'].value = open('path/to/file.txt', 'r')\n\n    # Submit\n    browser.submit(upload_form)\n\nBy default, creating a browser instantiates a new requests `Session`. \n\nRequirements\n------------\n\n- Python >= 2.6 or >= 3.3\n\nLicense\n-------\n\nMIT licensed. See the bundled `LICENSE <https://github.com/jmcarp/robobrowser/blob/master/LICENSE>`_ file for more details.\n\n",
    "bugtrack_url": null,
    "license": "MIT",
    "summary": "Your friendly neighborhood web scraper",
    "version": "0.5.3",
    "split_keywords": [
        "robobrowser"
    ],
    "urls": [
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "f353fd527e78fe8bdf7ec24b3b821c88c2dcbbe63d120c838f80f12ed0a0fac6",
                "md5": "333ad401f4a0b320fa873c78bc5fb64d",
                "sha256": "31219acab41ca68adce928e5c1e04acebba4ceabeb447b9c5e408d7b30fee983"
            },
            "downloads": -1,
            "filename": "robobrowser-0.5.3.tar.gz",
            "has_sig": false,
            "md5_digest": "333ad401f4a0b320fa873c78bc5fb64d",
            "packagetype": "sdist",
            "python_version": "source",
            "requires_python": null,
            "size": 22040,
            "upload_time": "2015-06-07T19:47:05",
            "upload_time_iso_8601": "2015-06-07T19:47:05.911178Z",
            "url": "https://files.pythonhosted.org/packages/f3/53/fd527e78fe8bdf7ec24b3b821c88c2dcbbe63d120c838f80f12ed0a0fac6/robobrowser-0.5.3.tar.gz",
            "yanked": false,
            "yanked_reason": null
        }
    ],
    "upload_time": "2015-06-07 19:47:05",
    "github": true,
    "gitlab": false,
    "bitbucket": false,
    "github_user": "jmcarp",
    "github_project": "robobrowser",
    "travis_ci": true,
    "coveralls": true,
    "github_actions": false,
    "tox": true,
    "lcname": "robobrowser"
}
        
Elapsed time: 0.02928s