=========
Memorious
=========

    The solitary and lucid spectator of a multiform, instantaneous and almost intolerably precise world.

    -- `Funes the Memorious <http://users.clas.ufl.edu/burt/spaceshotsairheads/borges-funes.pdf>`_,
    Jorge Luis Borges

.. image:: https://github.com/alephdata/memorious/workflows/memorious/badge.svg

``memorious`` is a lightweight web scraping toolkit. It supports scrapers that
collect structured or unstructured data. Its design goals are to:

* Make crawlers modular and simple tasks reusable
* Provide utility functions for common tasks such as data storage and HTTP session management
* Integrate crawlers with the Aleph and FollowTheMoney ecosystem
* Get out of your way as much as possible

Design
------

When writing a scraper, you often need to paginate through an index
page, then download an HTML page for each result, and finally parse that page
and insert or update a record in a database.

``memorious`` handles this by managing a set of ``crawlers``, each of which
can be composed of multiple ``stages``. Each ``stage`` is implemented as a
Python function, which can be reused across different ``crawlers``.
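
For example, a single stage might fetch a page handed to it and pass the
result on to the next stage. The sketch below is illustrative only:
``crawl_page`` is a hypothetical function name, and the attributes of the
``context`` helper and the HTTP response are assumptions based on the
description above.

.. code-block:: python

    # A sketch of a stage function. Every stage receives a ``context``
    # (crawler services such as the managed HTTP session) and a ``data``
    # dict emitted by the previous stage.
    def crawl_page(context, data):
        url = data.get("url")
        response = context.http.get(url)
        data["page"] = response.text
        # Queue the enriched data for the next stage in the pipeline.
        context.emit(data=data)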

The basic steps of writing a Memorious crawler:

1. Make a YAML crawler configuration file (see the sketch after this list)
2. Add the different stages
3. Write code for stage operations (optional)
4. Test, rinse, repeat
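
As a rough illustration of step 1, a crawler configuration wires named
stages into a pipeline: each stage declares the method to run and which
stage handles its output. Everything below is a hypothetical sketch;
consult the documentation for the real method names and parameters.

.. code-block:: yaml

    # A hypothetical crawler configuration. Each key under "pipeline"
    # is a stage; "handle" routes a stage's output to the next stage.
    name: example_crawler
    description: Scrape an example index
    pipeline:
      init:
        method: seed            # start from one or more seed URLs
        params:
          urls:
            - https://example.com/index.html
        handle:
          pass: fetch
      fetch:
        method: fetch           # download each page it is handed
        handle:
          pass: parse
      parse:
        method: parse           # extract links and records
        handle:
          store: store
      store:
        method: directory       # write results to a local directory
        params:
          path: data/results
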
Documentation
-------------

The documentation for Memorious is available at
`alephdata.github.io/memorious <https://alephdata.github.io/memorious/>`_.
Feel free to edit the source files in the ``docs`` folder and send pull requests for improvements.

To build the documentation, run ``make html`` inside the ``docs`` folder.
You'll find the resulting HTML files in ``docs/_build/html``.