# lxml to pandas for fast web scraping
## Tested against Windows / Python 3.11 / Anaconda
## pip install lxml2pandas
```python
from lxml2pandas import subprocess_parsing
from PrettyColorPrinter import add_printer
# Activate PrettyColorPrinter's printer.
# NOTE(review): the meaning of the argument 1 is not visible here - confirm
# against the PrettyColorPrinter documentation.
add_printer(1)

# (doc_id, source) pairs handed to subprocess_parsing. The first element appears
# to become the value of the `aa_doc_id` column (it is compared against
# "betano" further down); the second is a path to a saved .mhtml page.
htmldata = [
    ("bet365", r"C:\Users\hansc\Downloads\bet365 - Apostas Desportivas Online.mhtml"),
    (
        "betano",
        r"C:\Users\hansc\Downloads\Brasil Brasileirão - Série A Apostas - Futebol Odds _ Betano.mhtml",
    ),
    ("sportingbet", r"C:\Users\hansc\Downloads\Apostas Futebol _ Sportingbet.mhtml"),
]

# Parse all documents into one DataFrame, without any pre-filtering.
# NOTE(review): the exact semantics of chunks / processes / fake_header are
# defined by lxml2pandas and are not visible in this example.
df = subprocess_parsing(
    htmldata,
    chunks=1,
    processes=5,
    fake_header=True,
    print_stdout=True,
    print_stderr=True,
)
# bet365: print team names and odds for every fixture container.
#
# The class-name mask is the same for every fixture, so it is built once
# instead of being recomputed over the whole frame on each iteration.
is_team_or_odds = (df.aa_attr_values == "ovm-FixtureDetailsTwoWay_TeamName") | (
    df.aa_attr_values == "ovm-ParticipantOddsOnly_Odds"
)

# Each `aa_all_children` entry is used as a collection of element ids
# (it is fed to `aa_element_id.isin`).
for child in df.loc[df.aa_attr_values == "ovm-Fixture_Container"].aa_all_children:
    dfr = df.loc[df.aa_element_id.isin(child) & is_team_or_odds]
    # Only fixtures with exactly five matching rows are printed.
    # NOTE(review): presumably two team names plus three odds - confirm.
    if len(dfr) == 5:
        print(dfr)
# betano: print the odds and participant-name spans of every event.
#
# One `aa_all_children` entry per row whose attribute value is
# "events-list__grid__event"; each entry is used as a collection of element ids.
chi = df.loc[df.aa_attr_values == "events-list__grid__event"].aa_all_children

# Everything below is independent of the event being inspected, so it is
# evaluated once. The `aa_tag == "span"` test, which the original repeated in
# both branches of the `|`, is factored out (A&B | A&C == A & (B|C)).
is_betano = df.aa_doc_id == "betano"
is_span = df.aa_tag == "span"
is_odd = df.aa_attr_values == "selections__selection__odd"
# na=False: rows without an attribute value count as "no match".
is_participant = df.aa_attr_values.str.contains("participant-name", na=False)
wanted_rows = is_betano & is_span & (is_odd | is_participant)

for c in chi:
    print(df.loc[df.aa_element_id.isin(c) & wanted_rows])
# Keyword arguments shared by every call in this section.
parse_options = {
    "chunks": 1,
    "processes": 5,
    "fake_header": True,
    "print_stdout": True,
    "print_stderr": True,
}

# No filter arguments: same call as the initial parse.
df = subprocess_parsing(htmldata, **parse_options)

# pre-filter
# NOTE(review): how allowed_tags / allowed_attr / forbidden_tags combine is
# defined by lxml2pandas and is not visible in this example.

# Tags: span and div.
df0 = subprocess_parsing(htmldata, **parse_options, allowed_tags=("span", "div"))

# Tags: span only.
df1 = subprocess_parsing(htmldata, **parse_options, allowed_tags=("span",))

# Tags span/div combined with a list of allowed attribute values.
bet365_classes = (
    "ovm-Fixture_Container",
    "ovm-FixtureDetailsTwoWay_TeamName",
    "ovm-ParticipantOddsOnly_Odds",
)
df2 = subprocess_parsing(
    htmldata,
    **parse_options,
    allowed_tags=("span", "div"),
    allowed_attr=bet365_classes,
)

# Empty allowed_tags, one allowed attribute value, and <p> explicitly forbidden.
df3 = subprocess_parsing(
    htmldata,
    **parse_options,
    allowed_tags=(),
    allowed_attr=("ovm-ParticipantOddsOnly_Odds",),
    forbidden_tags=("p",),
)
# Pre-filter on attribute values, looking only at the `class` attribute key.
# The list covers the bet365 and betano class names queried elsewhere in this
# example.
# NOTE(review): the original listed "events-list__grid__even", while the rest
# of the example matches "events-list__grid__event"; the missing "t" looks
# like a typo and is corrected here on the assumption that allowed_attr
# matches whole attribute values - confirm against lxml2pandas.
df4 = subprocess_parsing(
    htmldata,
    chunks=1,
    processes=5,
    fake_header=True,
    print_stdout=True,
    print_stderr=True,
    allowed_tags=(),
    allowed_attr=(
        "ovm-Fixture_Container",
        "ovm-FixtureDetailsTwoWay_TeamName",
        "ovm-ParticipantOddsOnly_Odds",
        "events-list__grid__event",
        "selections__selection__odd",
        "events-list__grid__info__main__participants__participant-name tw-truncate",
    ),
    allowed_attr_keys=("class",),
)
# A single allowed attribute value, restricted to the `class` attribute key.
fixture_container_filter = {
    "allowed_tags": (),
    "allowed_attr": ("ovm-Fixture_Container",),
    "allowed_attr_keys": ("class",),
}
df5 = subprocess_parsing(
    htmldata,
    chunks=1,
    processes=5,
    fake_header=True,
    print_stdout=True,
    print_stderr=True,
    **fixture_container_filter,
)

# parse a webpage:
# The source may be a URL instead of a local file; all filter tuples are empty.
web_sources = [("python", "https://www.python.org/")]
empty_filters = {
    "allowed_tags": (),
    "allowed_attr": (),
    "allowed_attr_keys": (),
}
df = subprocess_parsing(
    web_sources,
    chunks=1,
    processes=5,
    fake_header=True,
    print_stdout=True,
    print_stderr=True,
    **empty_filters,
)
# Generate a column with css selectors
from lxml2pandas import pd_add_generate_css_selector, subprocess_parsing

# Must be called before `s_generate_css_selector` is used on a DataFrame.
# NOTE(review): presumably this registers the method on pandas objects -
# confirm against lxml2pandas.
pd_add_generate_css_selector()

htmldata = [
    ("bet365", r"C:\Users\hansc\Downloads\bet365 - Apostas Desportivas Online2.mhtml"),
]

# Parse, then replace the frame with the result of s_generate_css_selector().
df = subprocess_parsing(
    htmldata,
    chunks=1,
    processes=5,
    fake_header=True,
    print_stdout=False,
    print_stderr=True,
).s_generate_css_selector()
```
Raw data
{
"_id": null,
"home_page": "https://github.com/hansalemaos/lxml2pandas",
"name": "lxml2pandas",
"maintainer": "",
"docs_url": null,
"requires_python": "",
"maintainer_email": "",
"keywords": "lxml,pandas,web scraping",
"author": "Johannes Fischer",
"author_email": "aulasparticularesdealemaosp@gmail.com",
"download_url": "https://files.pythonhosted.org/packages/5b/35/1bdcb067c75c42d993ff2d65fdc1b85e799c46d73a1e2b276841f06e2744/lxml2pandas-0.16.tar.gz",
"platform": null,
"description": "\r\n# lxml to pandas for fast web scraping\r\n\r\n## Tested against Windows / Python 3.11 / Anaconda\r\n\r\n## pip install lxml2pandas\r\n\r\n```python\r\nfrom lxml2pandas import subprocess_parsing\r\nfrom PrettyColorPrinter import add_printer\r\n\r\nadd_printer(1)\r\n\r\nhtmldata = [\r\n (\"bet365\", r\"C:\\Users\\hansc\\Downloads\\bet365 - Apostas Desportivas Online.mhtml\"),\r\n (\r\n \"betano\",\r\n r\"C:\\Users\\hansc\\Downloads\\Brasil Brasileir\u00e3o - S\u00e9rie A Apostas - Futebol Odds _ Betano.mhtml\",\r\n ),\r\n (\"sportingbet\", r\"C:\\Users\\hansc\\Downloads\\Apostas Futebol _ Sportingbet.mhtml\"),\r\n]\r\nallframes = []\r\ndf = subprocess_parsing(\r\n htmldata,\r\n chunks=1,\r\n processes=5,\r\n fake_header=True,\r\n print_stdout=True,\r\n print_stderr=True,\r\n)\r\n\r\nfor child in df.loc[df.aa_attr_values == \"ovm-Fixture_Container\"].aa_all_children:\r\n dfr = df.loc[\r\n df.aa_element_id.isin(child)\r\n & (\r\n (df.aa_attr_values == \"ovm-FixtureDetailsTwoWay_TeamName\")\r\n | (df.aa_attr_values == \"ovm-ParticipantOddsOnly_Odds\")\r\n )\r\n ]\r\n if len(dfr) == 5:\r\n print(dfr)\r\n\r\nchi = df.loc[df.aa_attr_values == \"events-list__grid__event\"].aa_all_children\r\nfor c in chi:\r\n print(\r\n df.loc[\r\n (df.aa_element_id.isin(c))\r\n & (df.aa_doc_id == \"betano\")\r\n & (\r\n (\r\n (df.aa_tag == \"span\")\r\n & (df.aa_attr_values == \"selections__selection__odd\")\r\n )\r\n | (\r\n (df.aa_tag == \"span\")\r\n & (df.aa_attr_values.str.contains(\"participant-name\", na=False))\r\n )\r\n )\r\n ]\r\n )\r\n\r\n\r\ndf = subprocess_parsing(\r\n htmldata,\r\n chunks=1,\r\n processes=5,\r\n fake_header=True,\r\n print_stdout=True,\r\n print_stderr=True,\r\n)\r\n\r\n# pre-filter\r\ndf0 = subprocess_parsing(\r\n htmldata,\r\n chunks=1,\r\n processes=5,\r\n fake_header=True,\r\n print_stdout=True,\r\n print_stderr=True,\r\n allowed_tags=(\"span\", \"div\"),\r\n)\r\ndf1 = subprocess_parsing(\r\n htmldata,\r\n chunks=1,\r\n 
processes=5,\r\n fake_header=True,\r\n print_stdout=True,\r\n print_stderr=True,\r\n allowed_tags=(\"span\",),\r\n)\r\ndf2 = subprocess_parsing(\r\n htmldata,\r\n chunks=1,\r\n processes=5,\r\n fake_header=True,\r\n print_stdout=True,\r\n print_stderr=True,\r\n allowed_tags=(\"span\", \"div\"),\r\n allowed_attr=(\r\n \"ovm-Fixture_Container\",\r\n \"ovm-FixtureDetailsTwoWay_TeamName\",\r\n \"ovm-ParticipantOddsOnly_Odds\",\r\n ),\r\n)\r\n\r\ndf3 = subprocess_parsing(\r\n htmldata,\r\n chunks=1,\r\n processes=5,\r\n fake_header=True,\r\n print_stdout=True,\r\n print_stderr=True,\r\n allowed_tags=(),\r\n allowed_attr=(\"ovm-ParticipantOddsOnly_Odds\",),\r\n forbidden_tags=(\"p\",),\r\n)\r\n\r\ndf4 = subprocess_parsing(\r\n htmldata,\r\n chunks=1,\r\n processes=5,\r\n fake_header=True,\r\n print_stdout=True,\r\n print_stderr=True,\r\n allowed_tags=(),\r\n allowed_attr=(\r\n \"ovm-Fixture_Container\",\r\n \"ovm-FixtureDetailsTwoWay_TeamName\",\r\n \"ovm-ParticipantOddsOnly_Odds\",\r\n \"events-list__grid__even\",\r\n \"selections__selection__odd\",\r\n \"events-list__grid__info__main__participants__participant-name tw-truncate\",\r\n ),\r\n allowed_attr_keys=(\"class\",),\r\n)\r\n\r\n\r\ndf5 = subprocess_parsing(\r\n htmldata,\r\n chunks=1,\r\n processes=5,\r\n fake_header=True,\r\n print_stdout=True,\r\n print_stderr=True,\r\n allowed_tags=(),\r\n allowed_attr=(\"ovm-Fixture_Container\",),\r\n allowed_attr_keys=(\"class\",),\r\n)\r\n\r\n# parse a webpage:\r\ndf = subprocess_parsing(\r\n [(\"python\", \"https://www.python.org/\")],\r\n chunks=1,\r\n processes=5,\r\n fake_header=True,\r\n print_stdout=True,\r\n print_stderr=True,\r\n allowed_tags=(),\r\n allowed_attr=(),\r\n allowed_attr_keys=(),\r\n)\r\n\r\n\r\n\r\n# Generate a column with css selectors \r\n\r\nfrom lxml2pandas import subprocess_parsing,pd_add_generate_css_selector\r\npd_add_generate_css_selector()\r\nhtmldata = [\r\n (\"bet365\", r\"C:\\Users\\hansc\\Downloads\\bet365 - Apostas Desportivas 
Online2.mhtml\"),\r\n]\r\ndf = subprocess_parsing(\r\n htmldata,\r\n chunks=1,\r\n processes=5,\r\n fake_header=True,\r\n print_stdout=False,\r\n print_stderr=True,\r\n)\r\ndf = df.s_generate_css_selector()\r\n\r\n```\r\n",
"bugtrack_url": null,
"license": "MIT",
"summary": "lxml to pandas for fast web scraping",
"version": "0.16",
"project_urls": {
"Homepage": "https://github.com/hansalemaos/lxml2pandas"
},
"split_keywords": [
"lxml",
"pandas",
"web scraping"
],
"urls": [
{
"comment_text": "",
"digests": {
"blake2b_256": "722675481372f0ae77dbd9da260e0a9f1163899836433caa8094f6056d6a84e7",
"md5": "e9b670d0e906763f9ade09ea4325692b",
"sha256": "34990d5e93a6f50dd8cda1781e07a159c2cb386a4b93b3da20a4aa2d59e0208b"
},
"downloads": -1,
"filename": "lxml2pandas-0.16-py3-none-any.whl",
"has_sig": false,
"md5_digest": "e9b670d0e906763f9ade09ea4325692b",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": null,
"size": 40111,
"upload_time": "2023-11-19T20:52:08",
"upload_time_iso_8601": "2023-11-19T20:52:08.789248Z",
"url": "https://files.pythonhosted.org/packages/72/26/75481372f0ae77dbd9da260e0a9f1163899836433caa8094f6056d6a84e7/lxml2pandas-0.16-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
},
{
"comment_text": "",
"digests": {
"blake2b_256": "5b351bdcb067c75c42d993ff2d65fdc1b85e799c46d73a1e2b276841f06e2744",
"md5": "a5ac5bd05a4ff9ea6f05a4568d638f83",
"sha256": "75656182971c424402b1f04f472aed62562e4694b5002662dc0597093451ffe4"
},
"downloads": -1,
"filename": "lxml2pandas-0.16.tar.gz",
"has_sig": false,
"md5_digest": "a5ac5bd05a4ff9ea6f05a4568d638f83",
"packagetype": "sdist",
"python_version": "source",
"requires_python": null,
"size": 40320,
"upload_time": "2023-11-19T20:52:10",
"upload_time_iso_8601": "2023-11-19T20:52:10.949009Z",
"url": "https://files.pythonhosted.org/packages/5b/35/1bdcb067c75c42d993ff2d65fdc1b85e799c46d73a1e2b276841f06e2744/lxml2pandas-0.16.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2023-11-19 20:52:10",
"github": true,
"gitlab": false,
"bitbucket": false,
"codeberg": false,
"github_user": "hansalemaos",
"github_project": "lxml2pandas",
"travis_ci": false,
"coveralls": false,
"github_actions": false,
"requirements": [
{
"name": "a_pandas_ex_apply_ignore_exceptions",
"specs": []
},
{
"name": "fake_headers",
"specs": []
},
{
"name": "flatten_everything",
"specs": []
},
{
"name": "lxml",
"specs": []
},
{
"name": "multiprocnomain",
"specs": []
},
{
"name": "numpy",
"specs": []
},
{
"name": "pandas",
"specs": []
},
{
"name": "requests",
"specs": []
}
],
"lcname": "lxml2pandas"
}