lxml2pandas


Name: lxml2pandas JSON
Version: 0.16 PyPI version JSON
download
home_page: https://github.com/hansalemaos/lxml2pandas
Summary: lxml to pandas for fast web scraping
upload_time: 2023-11-19 20:52:10
maintainer
docs_url: None
author: Johannes Fischer
requires_python
license: MIT
keywords: lxml, pandas, web scraping
VCS
bugtrack_url
requirements: a_pandas_ex_apply_ignore_exceptions, fake_headers, flatten_everything, lxml, multiprocnomain, numpy, pandas, requests
Travis-CI No Travis.
coveralls test coverage No coveralls.
            
# lxml to pandas for fast web scraping

## Tested against Windows / Python 3.11 / Anaconda

## pip install lxml2pandas

```python
from lxml2pandas import subprocess_parsing
from PrettyColorPrinter import add_printer

add_printer(1)

htmldata = [
    ("bet365", r"C:\Users\hansc\Downloads\bet365 - Apostas Desportivas Online.mhtml"),
    (
        "betano",
        r"C:\Users\hansc\Downloads\Brasil Brasileirão - Série A Apostas - Futebol Odds _ Betano.mhtml",
    ),
    ("sportingbet", r"C:\Users\hansc\Downloads\Apostas Futebol _ Sportingbet.mhtml"),
]
allframes = []
df = subprocess_parsing(
    htmldata,
    chunks=1,
    processes=5,
    fake_header=True,
    print_stdout=True,
    print_stderr=True,
)

for child in df.loc[df.aa_attr_values == "ovm-Fixture_Container"].aa_all_children:
    dfr = df.loc[
        df.aa_element_id.isin(child)
        & (
            (df.aa_attr_values == "ovm-FixtureDetailsTwoWay_TeamName")
            | (df.aa_attr_values == "ovm-ParticipantOddsOnly_Odds")
        )
    ]
    if len(dfr) == 5:
        print(dfr)

chi = df.loc[df.aa_attr_values == "events-list__grid__event"].aa_all_children
for c in chi:
    print(
        df.loc[
            (df.aa_element_id.isin(c))
            & (df.aa_doc_id == "betano")
            & (
                (
                    (df.aa_tag == "span")
                    & (df.aa_attr_values == "selections__selection__odd")
                )
                | (
                    (df.aa_tag == "span")
                    & (df.aa_attr_values.str.contains("participant-name", na=False))
                )
            )
        ]
    )


df = subprocess_parsing(
    htmldata,
    chunks=1,
    processes=5,
    fake_header=True,
    print_stdout=True,
    print_stderr=True,
)

# pre-filter
df0 = subprocess_parsing(
    htmldata,
    chunks=1,
    processes=5,
    fake_header=True,
    print_stdout=True,
    print_stderr=True,
    allowed_tags=("span", "div"),
)
df1 = subprocess_parsing(
    htmldata,
    chunks=1,
    processes=5,
    fake_header=True,
    print_stdout=True,
    print_stderr=True,
    allowed_tags=("span",),
)
df2 = subprocess_parsing(
    htmldata,
    chunks=1,
    processes=5,
    fake_header=True,
    print_stdout=True,
    print_stderr=True,
    allowed_tags=("span", "div"),
    allowed_attr=(
        "ovm-Fixture_Container",
        "ovm-FixtureDetailsTwoWay_TeamName",
        "ovm-ParticipantOddsOnly_Odds",
    ),
)

df3 = subprocess_parsing(
    htmldata,
    chunks=1,
    processes=5,
    fake_header=True,
    print_stdout=True,
    print_stderr=True,
    allowed_tags=(),
    allowed_attr=("ovm-ParticipantOddsOnly_Odds",),
    forbidden_tags=("p",),
)

df4 = subprocess_parsing(
    htmldata,
    chunks=1,
    processes=5,
    fake_header=True,
    print_stdout=True,
    print_stderr=True,
    allowed_tags=(),
    allowed_attr=(
        "ovm-Fixture_Container",
        "ovm-FixtureDetailsTwoWay_TeamName",
        "ovm-ParticipantOddsOnly_Odds",
        "events-list__grid__even",
        "selections__selection__odd",
        "events-list__grid__info__main__participants__participant-name tw-truncate",
    ),
    allowed_attr_keys=("class",),
)


df5 = subprocess_parsing(
    htmldata,
    chunks=1,
    processes=5,
    fake_header=True,
    print_stdout=True,
    print_stderr=True,
    allowed_tags=(),
    allowed_attr=("ovm-Fixture_Container",),
    allowed_attr_keys=("class",),
)

# parse a webpage:
df = subprocess_parsing(
    [("python", "https://www.python.org/")],
    chunks=1,
    processes=5,
    fake_header=True,
    print_stdout=True,
    print_stderr=True,
    allowed_tags=(),
    allowed_attr=(),
    allowed_attr_keys=(),
)



# Generate a column with css selectors 

from lxml2pandas import subprocess_parsing,pd_add_generate_css_selector
pd_add_generate_css_selector()
htmldata = [
    ("bet365", r"C:\Users\hansc\Downloads\bet365 - Apostas Desportivas Online2.mhtml"),
]
df = subprocess_parsing(
    htmldata,
    chunks=1,
    processes=5,
    fake_header=True,
    print_stdout=False,
    print_stderr=True,
)
df = df.s_generate_css_selector()

```

            

Raw data

            {
    "_id": null,
    "home_page": "https://github.com/hansalemaos/lxml2pandas",
    "name": "lxml2pandas",
    "maintainer": "",
    "docs_url": null,
    "requires_python": "",
    "maintainer_email": "",
    "keywords": "lxml,pandas,web scraping",
    "author": "Johannes Fischer",
    "author_email": "aulasparticularesdealemaosp@gmail.com",
    "download_url": "https://files.pythonhosted.org/packages/5b/35/1bdcb067c75c42d993ff2d65fdc1b85e799c46d73a1e2b276841f06e2744/lxml2pandas-0.16.tar.gz",
    "platform": null,
    "description": "\r\n# lxml to pandas for fast web scraping\r\n\r\n## Tested against Windows / Python 3.11 / Anaconda\r\n\r\n## pip install lxml2pandas\r\n\r\n```python\r\nfrom lxml2pandas import subprocess_parsing\r\nfrom PrettyColorPrinter import add_printer\r\n\r\nadd_printer(1)\r\n\r\nhtmldata = [\r\n    (\"bet365\", r\"C:\\Users\\hansc\\Downloads\\bet365 - Apostas Desportivas Online.mhtml\"),\r\n    (\r\n        \"betano\",\r\n        r\"C:\\Users\\hansc\\Downloads\\Brasil Brasileir\u00e3o - S\u00e9rie A Apostas - Futebol Odds _ Betano.mhtml\",\r\n    ),\r\n    (\"sportingbet\", r\"C:\\Users\\hansc\\Downloads\\Apostas Futebol _ Sportingbet.mhtml\"),\r\n]\r\nallframes = []\r\ndf = subprocess_parsing(\r\n    htmldata,\r\n    chunks=1,\r\n    processes=5,\r\n    fake_header=True,\r\n    print_stdout=True,\r\n    print_stderr=True,\r\n)\r\n\r\nfor child in df.loc[df.aa_attr_values == \"ovm-Fixture_Container\"].aa_all_children:\r\n    dfr = df.loc[\r\n        df.aa_element_id.isin(child)\r\n        & (\r\n            (df.aa_attr_values == \"ovm-FixtureDetailsTwoWay_TeamName\")\r\n            | (df.aa_attr_values == \"ovm-ParticipantOddsOnly_Odds\")\r\n        )\r\n    ]\r\n    if len(dfr) == 5:\r\n        print(dfr)\r\n\r\nchi = df.loc[df.aa_attr_values == \"events-list__grid__event\"].aa_all_children\r\nfor c in chi:\r\n    print(\r\n        df.loc[\r\n            (df.aa_element_id.isin(c))\r\n            & (df.aa_doc_id == \"betano\")\r\n            & (\r\n                (\r\n                    (df.aa_tag == \"span\")\r\n                    & (df.aa_attr_values == \"selections__selection__odd\")\r\n                )\r\n                | (\r\n                    (df.aa_tag == \"span\")\r\n                    & (df.aa_attr_values.str.contains(\"participant-name\", na=False))\r\n                )\r\n            )\r\n        ]\r\n    )\r\n\r\n\r\ndf = subprocess_parsing(\r\n    htmldata,\r\n    chunks=1,\r\n    processes=5,\r\n    fake_header=True,\r\n    
print_stdout=True,\r\n    print_stderr=True,\r\n)\r\n\r\n# pre-filter\r\ndf0 = subprocess_parsing(\r\n    htmldata,\r\n    chunks=1,\r\n    processes=5,\r\n    fake_header=True,\r\n    print_stdout=True,\r\n    print_stderr=True,\r\n    allowed_tags=(\"span\", \"div\"),\r\n)\r\ndf1 = subprocess_parsing(\r\n    htmldata,\r\n    chunks=1,\r\n    processes=5,\r\n    fake_header=True,\r\n    print_stdout=True,\r\n    print_stderr=True,\r\n    allowed_tags=(\"span\",),\r\n)\r\ndf2 = subprocess_parsing(\r\n    htmldata,\r\n    chunks=1,\r\n    processes=5,\r\n    fake_header=True,\r\n    print_stdout=True,\r\n    print_stderr=True,\r\n    allowed_tags=(\"span\", \"div\"),\r\n    allowed_attr=(\r\n        \"ovm-Fixture_Container\",\r\n        \"ovm-FixtureDetailsTwoWay_TeamName\",\r\n        \"ovm-ParticipantOddsOnly_Odds\",\r\n    ),\r\n)\r\n\r\ndf3 = subprocess_parsing(\r\n    htmldata,\r\n    chunks=1,\r\n    processes=5,\r\n    fake_header=True,\r\n    print_stdout=True,\r\n    print_stderr=True,\r\n    allowed_tags=(),\r\n    allowed_attr=(\"ovm-ParticipantOddsOnly_Odds\",),\r\n    forbidden_tags=(\"p\",),\r\n)\r\n\r\ndf4 = subprocess_parsing(\r\n    htmldata,\r\n    chunks=1,\r\n    processes=5,\r\n    fake_header=True,\r\n    print_stdout=True,\r\n    print_stderr=True,\r\n    allowed_tags=(),\r\n    allowed_attr=(\r\n        \"ovm-Fixture_Container\",\r\n        \"ovm-FixtureDetailsTwoWay_TeamName\",\r\n        \"ovm-ParticipantOddsOnly_Odds\",\r\n        \"events-list__grid__even\",\r\n        \"selections__selection__odd\",\r\n        \"events-list__grid__info__main__participants__participant-name tw-truncate\",\r\n    ),\r\n    allowed_attr_keys=(\"class\",),\r\n)\r\n\r\n\r\ndf5 = subprocess_parsing(\r\n    htmldata,\r\n    chunks=1,\r\n    processes=5,\r\n    fake_header=True,\r\n    print_stdout=True,\r\n    print_stderr=True,\r\n    allowed_tags=(),\r\n    allowed_attr=(\"ovm-Fixture_Container\",),\r\n    allowed_attr_keys=(\"class\",),\r\n)\r\n\r\n# parse a 
webpage:\r\ndf = subprocess_parsing(\r\n    [(\"python\", \"https://www.python.org/\")],\r\n    chunks=1,\r\n    processes=5,\r\n    fake_header=True,\r\n    print_stdout=True,\r\n    print_stderr=True,\r\n    allowed_tags=(),\r\n    allowed_attr=(),\r\n    allowed_attr_keys=(),\r\n)\r\n\r\n\r\n\r\n# Generate a column with css selectors \r\n\r\nfrom lxml2pandas import subprocess_parsing,pd_add_generate_css_selector\r\npd_add_generate_css_selector()\r\nhtmldata = [\r\n    (\"bet365\", r\"C:\\Users\\hansc\\Downloads\\bet365 - Apostas Desportivas Online2.mhtml\"),\r\n]\r\ndf = subprocess_parsing(\r\n    htmldata,\r\n    chunks=1,\r\n    processes=5,\r\n    fake_header=True,\r\n    print_stdout=False,\r\n    print_stderr=True,\r\n)\r\ndf = df.s_generate_css_selector()\r\n\r\n```\r\n",
    "bugtrack_url": null,
    "license": "MIT",
    "summary": "lxml to pandas for fast web scraping",
    "version": "0.16",
    "project_urls": {
        "Homepage": "https://github.com/hansalemaos/lxml2pandas"
    },
    "split_keywords": [
        "lxml",
        "pandas",
        "web scraping"
    ],
    "urls": [
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "722675481372f0ae77dbd9da260e0a9f1163899836433caa8094f6056d6a84e7",
                "md5": "e9b670d0e906763f9ade09ea4325692b",
                "sha256": "34990d5e93a6f50dd8cda1781e07a159c2cb386a4b93b3da20a4aa2d59e0208b"
            },
            "downloads": -1,
            "filename": "lxml2pandas-0.16-py3-none-any.whl",
            "has_sig": false,
            "md5_digest": "e9b670d0e906763f9ade09ea4325692b",
            "packagetype": "bdist_wheel",
            "python_version": "py3",
            "requires_python": null,
            "size": 40111,
            "upload_time": "2023-11-19T20:52:08",
            "upload_time_iso_8601": "2023-11-19T20:52:08.789248Z",
            "url": "https://files.pythonhosted.org/packages/72/26/75481372f0ae77dbd9da260e0a9f1163899836433caa8094f6056d6a84e7/lxml2pandas-0.16-py3-none-any.whl",
            "yanked": false,
            "yanked_reason": null
        },
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "5b351bdcb067c75c42d993ff2d65fdc1b85e799c46d73a1e2b276841f06e2744",
                "md5": "a5ac5bd05a4ff9ea6f05a4568d638f83",
                "sha256": "75656182971c424402b1f04f472aed62562e4694b5002662dc0597093451ffe4"
            },
            "downloads": -1,
            "filename": "lxml2pandas-0.16.tar.gz",
            "has_sig": false,
            "md5_digest": "a5ac5bd05a4ff9ea6f05a4568d638f83",
            "packagetype": "sdist",
            "python_version": "source",
            "requires_python": null,
            "size": 40320,
            "upload_time": "2023-11-19T20:52:10",
            "upload_time_iso_8601": "2023-11-19T20:52:10.949009Z",
            "url": "https://files.pythonhosted.org/packages/5b/35/1bdcb067c75c42d993ff2d65fdc1b85e799c46d73a1e2b276841f06e2744/lxml2pandas-0.16.tar.gz",
            "yanked": false,
            "yanked_reason": null
        }
    ],
    "upload_time": "2023-11-19 20:52:10",
    "github": true,
    "gitlab": false,
    "bitbucket": false,
    "codeberg": false,
    "github_user": "hansalemaos",
    "github_project": "lxml2pandas",
    "travis_ci": false,
    "coveralls": false,
    "github_actions": false,
    "requirements": [
        {
            "name": "a_pandas_ex_apply_ignore_exceptions",
            "specs": []
        },
        {
            "name": "fake_headers",
            "specs": []
        },
        {
            "name": "flatten_everything",
            "specs": []
        },
        {
            "name": "lxml",
            "specs": []
        },
        {
            "name": "multiprocnomain",
            "specs": []
        },
        {
            "name": "numpy",
            "specs": []
        },
        {
            "name": "pandas",
            "specs": []
        },
        {
            "name": "requests",
            "specs": []
        }
    ],
    "lcname": "lxml2pandas"
}
        
Elapsed time: 0.13772s