# Scrapes a website (Selenium, SeleniumBase, undetected chromedriver, ...) with iframes and returns a DataFrame
## Tested against Windows / Python 3.11 / Anaconda
## pip install multiiframes2df
```
Scrapes data using the provided driver and processes it into a DataFrame
that includes each element and its children.
Args:
driver: The driver used to scrape the data.
filter_function: A function to filter the scraped data (default is None).
chunks: The number of chunks to divide the data into for processing (default is 1).
processes: The number of processes to use for parallel processing (default is 4).
print_stdout: Boolean indicating whether to print stdout (default is False).
print_stderr: Boolean indicating whether to print stderr (default is True).
Returns:
pandas DataFrame: The processed and filtered data.
Example:
from PrettyColorPrinter import add_printer # optional
from seleniumbase import Driver
from multiiframes2df import fast_scrape
add_printer(1)
driver = Driver(uc=True, undetected=True)
driver.get(r"https://testpages.herokuapp.com/styled/iframes-test.html")
df = fast_scrape(
driver=driver,
filter_function=None,
chunks=1,
processes=4,
print_stdout=False,
print_stderr=True,
)
for name, group in df.groupby("aa_groupnumber"):
    print(name, group)
df2 = fast_scrape(
driver=driver,
filter_function=lambda x: "List" in x and "<html>" not in x and "<body>" not in x,
chunks=1,
processes=4,
print_stdout=False,
print_stderr=True,
)
for name, group in df2.groupby("aa_groupnumber"):
    print(name, group)
```
Raw data
{
"_id": null,
"home_page": "https://github.com/hansalemaos/multiiframes2df",
"name": "multiiframes2df",
"maintainer": "",
"docs_url": null,
"requires_python": "",
"maintainer_email": "",
"keywords": "iframe,scrape",
"author": "Johannes Fischer",
"author_email": "aulasparticularesdealemaosp@gmail.com",
"download_url": "https://files.pythonhosted.org/packages/eb/98/4f075cc55800d99a2e17b11dc3d81c105fba6e1ddd36b7297b484d5399a6/multiiframes2df-0.10.tar.gz",
"platform": null,
"description": "\r\n# scrapes a website (Selenium, SeleniumBase, undetected chromedriver ...) with iframes and returns a DataFrame\r\n\r\n## Tested against Windows / Python 3.11 / Anaconda\r\n\r\n## pip install multiiframes2df\r\n\r\n```\r\nScrapes data using the provided driver and processes it to return a DataFrame\r\nwhich includes each element and its children.\r\n\r\nArgs:\r\n\tdriver: The driver used to scrape the data.\r\n\tfilter_function: A function to filter the scraped data (default is None).\r\n\tchunks: The number of chunks to divide the data into for processing (default is 1).\r\n\tprocesses: The number of processes to use for parallel processing (default is 4).\r\n\tprint_stdout: Boolean indicating whether to print stdout (default is False).\r\n\tprint_stderr: Boolean indicating whether to print stderr (default is True).\r\n\r\nReturns:\r\n\tpandas DataFrame: The processed and filtered data.\r\n\r\nExample:\r\n\tfrom PrettyColorPrinter import add_printer # optional\r\n\tfrom seleniumbase import Driver\r\n\tfrom multiiframes2df import fast_scrape\r\n\r\n\tadd_printer(1)\r\n\tdriver = Driver(uc=True, undetected=True)\r\n\tdriver.get(r\"https://testpages.herokuapp.com/styled/iframes-test.html\")\r\n\tdf = fast_scrape(\r\n\t\tdriver=driver,\r\n\t\tfilter_function=None,\r\n\t\tchunks=1,\r\n\t\tprocesses=4,\r\n\t\tprint_stdout=False,\r\n\t\tprint_stderr=True,\r\n\t)\r\n\tfor name, group in df.groupby(\"aa_groupnumber\"):\r\n\t\tprint(name, group)\r\n\r\n\r\n\tdf2 = fast_scrape(\r\n\t\tdriver=driver,\r\n\t\tfilter_function=lambda x: \"List\" in x and \"<html>\" not in x and \"<body>\" not in x,\r\n\t\tchunks=1,\r\n\t\tprocesses=4,\r\n\t\tprint_stdout=False,\r\n\t\tprint_stderr=True,\r\n\t)\r\n\tfor name, group in df2.groupby(\"aa_groupnumber\"):\r\n\t\tprint(name, group)\r\n\t\t\r\n```\r\n",
"bugtrack_url": null,
"license": "MIT",
"summary": "scrapes a website (Selenium, SeleniumBase, undetected chromedriver ...) with iframes and returns a DataFrame",
"version": "0.10",
"project_urls": {
"Homepage": "https://github.com/hansalemaos/multiiframes2df"
},
"split_keywords": [
"iframe",
"scrape"
],
"urls": [
{
"comment_text": "",
"digests": {
"blake2b_256": "6d90c2b2b2a80214864b04f3bacf5a58673807cde524c2514e9517ac2757de0c",
"md5": "f721efae92d6caef0d2afde4dafe0a06",
"sha256": "f6ce8bafa8595539ff42478cbfb61e05441a7edaae6ac8c5a5198011b6a51d0c"
},
"downloads": -1,
"filename": "multiiframes2df-0.10-py3-none-any.whl",
"has_sig": false,
"md5_digest": "f721efae92d6caef0d2afde4dafe0a06",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": null,
"size": 7494,
"upload_time": "2024-02-02T04:35:37",
"upload_time_iso_8601": "2024-02-02T04:35:37.575365Z",
"url": "https://files.pythonhosted.org/packages/6d/90/c2b2b2a80214864b04f3bacf5a58673807cde524c2514e9517ac2757de0c/multiiframes2df-0.10-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
},
{
"comment_text": "",
"digests": {
"blake2b_256": "eb984f075cc55800d99a2e17b11dc3d81c105fba6e1ddd36b7297b484d5399a6",
"md5": "10db62ca570e0e6448f6ba5821414d28",
"sha256": "9aa4811becaf973e2ddc665f46c6b897abf3c12828cd4915174116a4f0b29cb9"
},
"downloads": -1,
"filename": "multiiframes2df-0.10.tar.gz",
"has_sig": false,
"md5_digest": "10db62ca570e0e6448f6ba5821414d28",
"packagetype": "sdist",
"python_version": "source",
"requires_python": null,
"size": 5590,
"upload_time": "2024-02-02T04:35:39",
"upload_time_iso_8601": "2024-02-02T04:35:39.642300Z",
"url": "https://files.pythonhosted.org/packages/eb/98/4f075cc55800d99a2e17b11dc3d81c105fba6e1ddd36b7297b484d5399a6/multiiframes2df-0.10.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2024-02-02 04:35:39",
"github": true,
"gitlab": false,
"bitbucket": false,
"codeberg": false,
"github_user": "hansalemaos",
"github_project": "multiiframes2df",
"travis_ci": false,
"coveralls": false,
"github_actions": false,
"requirements": [
{
"name": "lxml2pandas",
"specs": []
},
{
"name": "pandas",
"specs": []
}
],
"lcname": "multiiframes2df"
}