NLPX


Name NLPX JSON
Version 1.8.1 PyPI version JSON
download
home_page https://gitee.com/summry/nlpx
Summary A tool set for NLP. Text classification. Trainer. Tokenizer
upload_time 2024-12-18 16:36:48
maintainer None
docs_url None
author summy
requires_python >=3.6
license None
keywords nlp nlp ai llm gpt machine learning deep learning tokenize torch
VCS
bugtrack_url
requirements No requirements were recorded.
Travis-CI No Travis.
coveralls test coverage No coveralls.
            Usage Sample
''''''''''''

.. code:: python

        import torch
        from sklearn.model_selection import train_test_split
        from nlpx.text_token import Tokenizer
        from nlpx.model.classifier import TextCNNClassifier
        from nlpx.model.wrapper import ClassModelWrapper
        from nlpx.dataset import TokenDataset, PaddingTokenCollator

        if __name__ == '__main__':
            classes = ['class1', 'class2', 'class3'...]
            texts = [[str],]
            labels = [0, 0, 1, 2, 1...]
            tokenizer = Tokenizer.from_texts(texts, min_freq=5)
            sent = 'I love you'
            tokens = tokenizer.encode(sent, max_length=6)
            # [101, 66, 88, 99, 102, 0]
            sent = tokenizer.decode(tokens)
            # ['<BOS>', 'I', 'love', 'you', '<EOS>', '<PAD>']

            tokens = tokenizer.batch_encode(texts, padding=False)
            X_train, X_test, y_train, y_test = train_test_split(tokens, labels, test_size=0.2)
            train_set = TokenDataset(X_train, y_train)
            val_set = TokenDataset(X_test, y_test)

            model = TextCNNClassifier(embed_dim=128, vocab_size=tokenizer.vocab_size, num_classes=len(classes))
            model_wrapper = ClassModelWrapper(model, classes=classes)
            model_wrapper.train(train_set, val_set, show_progress=True, collate_fn=PaddingTokenCollator(tokenizer.pad))

            result = model_wrapper.evaluate(val_set, collate_fn=PaddingTokenCollator(tokenizer.pad))
            # 0.953125

            test_inputs = torch.tensor(test_tokens, dtype=torch.long)
            result = model_wrapper.predict(test_inputs)
            # [0, 1]

            result = model_wrapper.predict_classes(test_inputs)
            # ['class1', 'class2']

            result = model_wrapper.predict_proba(test_inputs)
            # ([0, 1], array([0.99439645, 0.99190724], dtype=float32))

            result = model_wrapper.predict_classes_proba(test_inputs)
            # (['class1', 'class2'], array([0.99439645, 0.99190724], dtype=float32))



            

Raw data

            {
    "_id": null,
    "home_page": "https://gitee.com/summry/nlpx",
    "name": "NLPX",
    "maintainer": null,
    "docs_url": null,
    "requires_python": ">=3.6",
    "maintainer_email": null,
    "keywords": "NLP, nlp, AI, llm, GPT, Machine learning, Deep learning, tokenize, torch",
    "author": "summy",
    "author_email": "xiazhongbiao@126.com",
    "download_url": "https://files.pythonhosted.org/packages/30/72/eef6a8441b478e1b30d6beeaf9ac6863a0da01518838f438b54e45bbbb80/NLPX-1.8.1.tar.gz",
    "platform": null,
    "description": "Usage Sample\n''''''''''''\n\n.. code:: python\n\n        import torch\n        from sklearn.model_selection import train_test_split\n        from nlpx.text_token import Tokenizer\n        from nlpx.model.classifier import TextCNNClassifier\n        from nlpx.model.wrapper import ClassModelWrapper\n        from nlpx.dataset import TokenDataset, PaddingTokenCollator\n\n        if __name__ == '__main__':\n            classes = ['class1', 'class2', 'class3'...]\n            texts = [[str],]\n            labels = [0, 0, 1, 2, 1...]\n            tokenizer = Tokenizer.from_texts(texts, min_freq=5)\n            sent = 'I love you'\n            tokens = tokenizer.encode(sent, max_length=6)\n            # [101, 66, 88, 99, 102, 0]\n            sent = tokenizer.decode(tokens)\n            # ['<BOS>', 'I', 'love', 'you', '<EOS>', '<PAD>']\n\n            tokens = tokenizer.batch_encode(texts, padding=False)\n            X_train, X_test, y_train, y_test = train_test_split(tokens, labels, test_size=0.2)\n            train_set = TokenDataset(X_train, y_train)\n            val_set = TokenDataset(X_test, y_test)\n\n            model = TextCNNClassifier(embed_dim=128, vocab_size=tokenizer.vocab_size, num_classes=len(classes))\n            model_wrapper = ClassModelWrapper(model, classes=classes)\n            model_wrapper.train(train_set, val_set, show_progress=True, collate_fn=PaddingTokenCollator(tokenizer.pad))\n\n            result = model_wrapper.evaluate(val_set, collate_fn=PaddingTokenCollator(tokenizer.pad))\n            # 0.953125\n\n            test_inputs = torch.tensor(test_tokens, dtype=torch.long)\n            result = model_wrapper.predict(test_inputs)\n            # [0, 1]\n\n            result = model_wrapper.predict_classes(test_inputs)\n            # ['class1', 'class2']\n\n            result = model_wrapper.predict_proba(test_inputs)\n            # ([0, 1], array([0.99439645, 0.99190724], dtype=float32))\n\n            result = model_wrapper.predict_classes_proba(test_inputs)\n            # (['class1', 'class2'], array([0.99439645, 0.99190724], dtype=float32))\n\n\n",
    "bugtrack_url": null,
    "license": null,
    "summary": "A tool set for NLP. Text classification. Trainer. Tokenizer",
    "version": "1.8.1",
    "project_urls": {
        "Homepage": "https://gitee.com/summry/nlpx"
    },
    "split_keywords": [
        "nlp",
        " nlp",
        " ai",
        " llm",
        " gpt",
        " machine learning",
        " deep learning",
        " tokenize",
        " torch"
    ],
    "urls": [
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "3072eef6a8441b478e1b30d6beeaf9ac6863a0da01518838f438b54e45bbbb80",
                "md5": "2a8755751b0bcb5a4da8750f51eff952",
                "sha256": "e139e42e82678d07ba3fbf2beae32dc0e5daaa2c1f2a833ae6e100ccca09abc3"
            },
            "downloads": -1,
            "filename": "NLPX-1.8.1.tar.gz",
            "has_sig": false,
            "md5_digest": "2a8755751b0bcb5a4da8750f51eff952",
            "packagetype": "sdist",
            "python_version": "source",
            "requires_python": ">=3.6",
            "size": 29254,
            "upload_time": "2024-12-18T16:36:48",
            "upload_time_iso_8601": "2024-12-18T16:36:48.460358Z",
            "url": "https://files.pythonhosted.org/packages/30/72/eef6a8441b478e1b30d6beeaf9ac6863a0da01518838f438b54e45bbbb80/NLPX-1.8.1.tar.gz",
            "yanked": false,
            "yanked_reason": null
        }
    ],
    "upload_time": "2024-12-18 16:36:48",
    "github": false,
    "gitlab": false,
    "bitbucket": false,
    "codeberg": false,
    "lcname": "nlpx"
}
        
Elapsed time: 0.45381s