xmlhtml2pandas


Name: xmlhtml2pandas JSON
Version 0.14 PyPI version JSON
download
home_page: https://github.com/hansalemaos/xmlhtml2pandas
Summary: html/ocr parser using Cython/lxml/Tesseract/ImageMagick/Pandas
upload_time: 2024-07-04 02:58:20
maintainer: None
docs_url: None
author: Johannes Fischer
requires_python: None
license: MIT
keywords: ocr, html
VCS
bugtrack_url
requirements No requirements were recorded.
Travis-CI No Travis.
coveralls test coverage No coveralls.
            
# html/ocr parser using Cython/lxml/Tesseract/ImageMagick/Pandas

### Tested against Windows 10 / Python 3.11 / Anaconda

### pip install xmlhtml2pandas

### Cython and a C compiler must be installed!

```PY
import os
# Tesseract and ImageMagick must be installed!
os.environ["OMP_THREAD_LIMIT"] = "1"  # to limit the number of threads (tesseract)
os.environ["MAGICK_THREAD_LIMIT"] = "1"  # to limit the number of threads (ImageMagick)
from xmlhtml2pandas import parse_xmlhtml, preprocess_images_and_run_tesseract
from cythondfprint import add_printer  # fast color printer for pandas df

add_printer(1)
for file2parse in [
    r"C:\Users\hansc\Downloads\Apostas Futebol _ Sportingbet.mhtml",
    r"C:\Users\hansc\Downloads\bet365 - Apostas Desportivas Online.mhtml",
    r"C:\Users\hansc\Downloads\bet365 - Apostas Desportivas Online2.mhtml",
]:
    with open(
        file2parse,
        "rb",
    ) as f:
        df_html = parse_xmlhtml(f, "html", ())
        print(df_html)
        print(df_html.dtypes)


for picture in preprocess_images_and_run_tesseract(
    density=200,
    resize_percentage=100,
    tesser_cpus=1,
    image_magick_cpus=1,
    path_in=r"C:\Users\hansc\Desktop\testimg",  # for folders
    path_out=r"C:\Users\hansc\Desktop\testimg_outfiles",  #  for folders
    # path_in=r"C:\Users\hansc\Downloads\apicture.png",# single file
    # path_out=r"C:\Users\hansc\Downloads\afolderforapicture", # single file - folder as output
    magick_options="""-colorspace LinearGray  -normalize -auto-level -alpha deactivate  -adaptive-blur 1 -adaptive-sharpen 1 -trim -fuzz 60 -antialias -auto-gamma -auto-level -black-point-compensation -normalize -enhance -white-balance -antialias -black-threshold 4 -mean-shift 1x5+17%""",
    magick_path=r"C:\Program Files\ImageMagick-7.1.1-Q16-HDRI\magick.exe",
    tesseractpath=r"C:\Program Files\Tesseract-OCR\tesseract.exe",
    tessdata_dir=r"C:\Program Files\Tesseract-OCR\tessdata",
    tesser_options_str="-l por+eng --oem 3 --psm 6 -c tessedit_create_hocr=1 -c hocr_font_info=1 -c tessedit_pageseg_mode=6",
    debug=False,
    subprocess_kwargs_tesser=None,
    subprocess_kwargs_magick=None,
    include_screenshots=True,
):
    print(picture)

# On Android (Termux): take a screenshot with screencap and run OCR on it

import os 
import subprocess 
os.environ["OMP_THREAD_LIMIT"] = "1"  
os.environ["MAGICK_THREAD_LIMIT"] = "1" 
os.environ["KMP_ALL_THREADS"] = "1"   
os.environ["KMP_TEAMS_THREAD_LIMIT"] = "1" 
os.environ["OMP_THREAD_LIMIT"] = "1"  
os.environ["KMP_DEVICE_THREAD_LIMIT"] = "1" 

from xmlhtml2pandas import parse_xmlhtml, preprocess_images_and_run_tesseract
subprocess.run("screencap -p > /sdcard/shot.png",shell=True)
for picture in preprocess_images_and_run_tesseract(
    density=200,
    resize_percentage=100,
    tesser_cpus=1,
    image_magick_cpus=1,
    path_in=r"/sdcard/shot.png", 
    path_out=r"/sdcard/Downloadsout",
    magick_options="""-colorspace LinearGray  -normalize -auto-level -alpha deactivate  -adaptive-blur 1 -adaptive-sharpen 1 -trim -fuzz 60 -antialias -auto-gamma -auto-level -black-point-compensation -normalize -enhance -white-balance -antialias -black-threshold 4 -mean-shift 1x5+17%""",
    magick_path=r"/data/data/com.termux/files/usr/bin/magick",
    tesseractpath=r"/data/data/com.termux/files/usr/bin/tesseract",
    tessdata_dir=r"/data/data/com.termux/files/usr/share/tessdata_fast",
    tesser_options_str="-l por+eng --oem 3 --psm 6 -c tessedit_create_hocr=1 -c hocr_font_info=1 -c tessedit_pageseg_mode=6",
    debug=False,
    subprocess_kwargs_tesser=None,
    subprocess_kwargs_magick=None,
    include_screenshots=False,
):
    print(picture)
```

            

Raw data

            {
    "_id": null,
    "home_page": "https://github.com/hansalemaos/xmlhtml2pandas",
    "name": "xmlhtml2pandas",
    "maintainer": null,
    "docs_url": null,
    "requires_python": null,
    "maintainer_email": null,
    "keywords": "ocr, html",
    "author": "Johannes Fischer",
    "author_email": "aulasparticularesdealemaosp@gmail.com",
    "download_url": "https://files.pythonhosted.org/packages/d4/da/93ae71fd08abc74d2b85bc7c4b48e99a0baa673628a1207cecda9afd3836/xmlhtml2pandas-0.14.tar.gz",
    "platform": null,
    "description": "\r\n# html/ocr parser using Cython/lxml/Tesseract/ImageMagick/Pandas\r\n\r\n### Tested against Windows 10 / Python 3.11 / Anaconda / Windows \r\n\r\n### pip install xmlhtml2pandas\r\n\r\n### Cython and a C compiler must be installed!\r\n\r\n```PY\r\nimport os\r\n# Tesseract and ImageMagick must be installed!\r\nos.environ[\"OMP_THREAD_LIMIT\"] = \"1\"  # to limit the number of threads (tesseract)\r\nos.environ[\"MAGICK_THREAD_LIMIT\"] = \"1\"  # to limit the number of threads (ImageMagick)\r\nfrom xmlhtml2pandas import parse_xmlhtml, preprocess_images_and_run_tesseract\r\nfrom cythondfprint import add_printer  # fast color printer for pandas df\r\n\r\nadd_printer(1)\r\nfor file2parse in [\r\n    r\"C:\\Users\\hansc\\Downloads\\Apostas Futebol _ Sportingbet.mhtml\",\r\n    r\"C:\\Users\\hansc\\Downloads\\bet365 - Apostas Desportivas Online.mhtml\",\r\n    r\"C:\\Users\\hansc\\Downloads\\bet365 - Apostas Desportivas Online2.mhtml\",\r\n]:\r\n    with open(\r\n        file2parse,\r\n        \"rb\",\r\n    ) as f:\r\n        df_html = parse_xmlhtml(f, \"html\", ())\r\n        print(df_html)\r\n        print(df_html.dtypes)\r\n\r\n\r\nfor picture in preprocess_images_and_run_tesseract(\r\n    density=200,\r\n    resize_percentage=100,\r\n    tesser_cpus=1,\r\n    image_magick_cpus=1,\r\n    path_in=r\"C:\\Users\\hansc\\Desktop\\testimg\",  # for folders\r\n    path_out=r\"C:\\Users\\hansc\\Desktop\\testimg_outfiles\",  #  for folders\r\n    # path_in=r\"C:\\Users\\hansc\\Downloads\\apicture.png\",# single file\r\n    # path_out=r\"C:\\Users\\hansc\\Downloads\\afolderforapicture\", # single file - folder as output\r\n    magick_options=\"\"\"-colorspace LinearGray  -normalize -auto-level -alpha deactivate  -adaptive-blur 1 -adaptive-sharpen 1 -trim -fuzz 60 -antialias -auto-gamma -auto-level -black-point-compensation -normalize -enhance -white-balance -antialias -black-threshold 4 -mean-shift 1x5+17%\"\"\",\r\n    magick_path=r\"C:\\Program 
Files\\ImageMagick-7.1.1-Q16-HDRI\\magick.exe\",\r\n    tesseractpath=r\"C:\\Program Files\\Tesseract-OCR\\tesseract.exe\",\r\n    tessdata_dir=r\"C:\\Program Files\\Tesseract-OCR\\tessdata\",\r\n    tesser_options_str=\"-l por+eng --oem 3 --psm 6 -c tessedit_create_hocr=1 -c hocr_font_info=1 -c tessedit_pageseg_mode=6\",\r\n    debug=False,\r\n    subprocess_kwargs_tesser=None,\r\n    subprocess_kwargs_magick=None,\r\n    include_screenshots=True,\r\n):\r\n    print(picture)\r\n\r\n# on android \r\n\r\nimport os \r\nimport subprocess \r\nos.environ[\"OMP_THREAD_LIMIT\"] = \"1\"  \r\nos.environ[\"MAGICK_THREAD_LIMIT\"] = \"1\" \r\nos.environ[\"KMP_ALL_THREADS\"] = \"1\"   \r\nos.environ[\"KMP_TEAMS_THREAD_LIMIT\"] = \"1\" \r\nos.environ[\"OMP_THREAD_LIMIT\"] = \"1\"  \r\nos.environ[\"KMP_DEVICE_THREAD_LIMIT\"] = \"1\" \r\n\r\nfrom xmlhtml2pandas import parse_xmlhtml, preprocess_images_and_run_tesseract\r\nsubprocess.run(\"screencap -p > /sdcard/shot.png\",shell=True)\r\nfor picture in preprocess_images_and_run_tesseract(\r\n    density=200,\r\n    resize_percentage=100,\r\n    tesser_cpus=1,\r\n    image_magick_cpus=1,\r\n    path_in=r\"/sdcard/shot.png\", \r\n    path_out=r\"/sdcard/Downloadsout\",\r\n    magick_options=\"\"\"-colorspace LinearGray  -normalize -auto-level -alpha deactivate  -adaptive-blur 1 -adaptive-sharpen 1 -trim -fuzz 60 -antialias -auto-gamma -auto-level -black-point-compensation -normalize -enhance -white-balance -antialias -black-threshold 4 -mean-shift 1x5+17%\"\"\",\r\n    magick_path=r\"/data/data/com.termux/files/usr/bin/magick\",\r\n    tesseractpath=r\"/data/data/com.termux/files/usr/bin/tesseract\",\r\n    tessdata_dir=r\"/data/data/com.termux/files/usr/share/tessdata_fast\",\r\n    tesser_options_str=\"-l por+eng --oem 3 --psm 6 -c tessedit_create_hocr=1 -c hocr_font_info=1 -c tessedit_pageseg_mode=6\",\r\n    debug=False,\r\n    subprocess_kwargs_tesser=None,\r\n    subprocess_kwargs_magick=None,\r\n    
include_screenshots=False,\r\n):\r\n    print(picture)\r\n```\r\n",
    "bugtrack_url": null,
    "license": "MIT",
    "summary": "html/ocr parser using Cython/lxml/Tesseract/ImageMagick/Pandas",
    "version": "0.14",
    "project_urls": {
        "Homepage": "https://github.com/hansalemaos/xmlhtml2pandas"
    },
    "split_keywords": [
        "ocr",
        " html"
    ],
    "urls": [
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "40a7671e769ebd4e1dafdb872f618ab3f6f625af8fe6f1470f0f185091ab5ea0",
                "md5": "d6d6638837c47146475d88ba5fc6a2a9",
                "sha256": "3a60b80fd1551256a4df4a1f4906b11ea13a120c17a69ad79a742296252c10da"
            },
            "downloads": -1,
            "filename": "xmlhtml2pandas-0.14-py3-none-any.whl",
            "has_sig": false,
            "md5_digest": "d6d6638837c47146475d88ba5fc6a2a9",
            "packagetype": "bdist_wheel",
            "python_version": "py3",
            "requires_python": null,
            "size": 32608,
            "upload_time": "2024-07-04T02:58:18",
            "upload_time_iso_8601": "2024-07-04T02:58:18.524097Z",
            "url": "https://files.pythonhosted.org/packages/40/a7/671e769ebd4e1dafdb872f618ab3f6f625af8fe6f1470f0f185091ab5ea0/xmlhtml2pandas-0.14-py3-none-any.whl",
            "yanked": false,
            "yanked_reason": null
        },
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "d4da93ae71fd08abc74d2b85bc7c4b48e99a0baa673628a1207cecda9afd3836",
                "md5": "89ce7465fc41973939d6829778109e45",
                "sha256": "2a886f61968e593475a0f6c9c9ec25cbf3519fb6102744f35d3f548739771d57"
            },
            "downloads": -1,
            "filename": "xmlhtml2pandas-0.14.tar.gz",
            "has_sig": false,
            "md5_digest": "89ce7465fc41973939d6829778109e45",
            "packagetype": "sdist",
            "python_version": "source",
            "requires_python": null,
            "size": 32217,
            "upload_time": "2024-07-04T02:58:20",
            "upload_time_iso_8601": "2024-07-04T02:58:20.074827Z",
            "url": "https://files.pythonhosted.org/packages/d4/da/93ae71fd08abc74d2b85bc7c4b48e99a0baa673628a1207cecda9afd3836/xmlhtml2pandas-0.14.tar.gz",
            "yanked": false,
            "yanked_reason": null
        }
    ],
    "upload_time": "2024-07-04 02:58:20",
    "github": true,
    "gitlab": false,
    "bitbucket": false,
    "codeberg": false,
    "github_user": "hansalemaos",
    "github_project": "xmlhtml2pandas",
    "travis_ci": false,
    "coveralls": false,
    "github_actions": false,
    "requirements": [],
    "lcname": "xmlhtml2pandas"
}
        
Elapsed time: 0.38147s