# nahiarhdNLP - Indonesian Natural Language Processing Library
An Indonesian Natural Language Processing library featuring text preprocessing, slang normalization, emoji conversion, spelling correction, and various other text-processing utilities.
## 🚀 Installation
```bash
pip install nahiarhdNLP
```
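As a quick smoke test after installing, the snippet below (a sketch reusing `replace_spell_corrector`, which is documented later in this README) normalizes a short slang sentence:

```python
from nahiarhdNLP.preprocessing import replace_spell_corrector

# Expected output per the slang-normalization examples below
print(replace_spell_corrector("gw lg di rmh"))
# "gue lagi di rumah"
```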
## 📦 Importing the Library
```python
# Import functions from preprocessing
from nahiarhdNLP.preprocessing import (
    # Basic cleaning functions
    remove_html, remove_emoji, remove_url, remove_mentions, remove_hashtags,
    remove_numbers, remove_punctuation, remove_extra_spaces,
    remove_special_chars, remove_whitespace, to_lowercase,
    # Normalization and correction functions
    replace_spell_corrector, replace_repeated_chars,
    # Emoji functions
    emoji_to_words, words_to_emoji,
    # Linguistic functions
    remove_stopwords, stem_text, tokenize,
    # Pipeline functions
    pipeline, preprocess, Pipeline
)

# Import classes for advanced usage
from nahiarhdNLP.preprocessing import (
    TextCleaner, SpellCorrector, StopwordRemover,
    Stemmer, EmojiConverter, Tokenizer
)

# Import the dataset loader
from nahiarhdNLP.datasets import DatasetLoader
```
## 📋 Usage Examples

### 1. 🧹 TextCleaner - Cleaning Text

```python
from nahiarhdNLP.preprocessing import TextCleaner

cleaner = TextCleaner()

# Remove HTML tags
html_text = "website <a href='https://google.com'>google</a>"
clean_result = cleaner.clean_html(html_text)
print(clean_result)
# Output: "website google"

# Remove URLs
url_text = "kunjungi https://google.com sekarang!"
clean_result = cleaner.clean_urls(url_text)
print(clean_result)
# Output: "kunjungi sekarang!"

# Remove mentions
mention_text = "Halo @user123 apa kabar?"
clean_result = cleaner.clean_mentions(mention_text)
print(clean_result)
# Output: "Halo apa kabar?"

# Remove emoji
emoji_text = "Halo dunia 😀😁 apa kabar? 🎉"
clean_result = cleaner.clean_emoji(emoji_text)
print(clean_result)
# Output: "Halo dunia apa kabar?"
```
### 2. ✏️ SpellCorrector - Spelling Correction & Slang Normalization

```python
from nahiarhdNLP.preprocessing import SpellCorrector

spell = SpellCorrector()

# Correct a misspelled word
word = "sya"
corrected = spell.correct_word(word)
print(corrected)
# Output: "saya"

# Correct a full sentence (including slang normalization)
sentence = "sya suka mkn nasi"
corrected = spell.correct_sentence(sentence)
print(corrected)
# Output: "saya suka makan nasi"

# Slang normalization
slang_text = "gw lg di rmh"
normalized = spell.correct_sentence(slang_text)
print(normalized)
# Output: "gue lagi di rumah"
```
### 3. 🚫 StopwordRemover - Removing Stopwords

```python
from nahiarhdNLP.preprocessing import StopwordRemover

stopword = StopwordRemover()
stopword._load_data()  # Load the stopword dataset

# Remove stopwords
text = "saya suka makan nasi goreng"
result = stopword.remove_stopwords(text)
print(result)
# Output: "suka makan nasi goreng"

# Check whether a word is a stopword
is_stop = stopword.is_stopword("adalah")
print(is_stop)  # True
```
### 4. 😀 EmojiConverter - Emoji Conversion

```python
from nahiarhdNLP.preprocessing import EmojiConverter

emoji = EmojiConverter()
emoji._load_data()  # Load the emoji dataset

# Emoji to text
emoji_text = "😀 😂 😍"
text_result = emoji.emoji_to_text_convert(emoji_text)
print(text_result)
# Output: "wajah_gembira wajah_gembira_berurai_air_mata wajah_tersenyum_lebar_bermata_hati"

# Text to emoji
text = "wajah_gembira"
emoji_result = emoji.text_to_emoji_convert(text)
print(emoji_result)
# Output: "😀"
```
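Both directions use the same `emoji.csv` mapping, so a round trip should be lossless for any emoji that has an entry. A small sketch using the two methods shown above:

```python
# Round trip: emoji -> alias words -> emoji
original = "😀"
as_text = emoji.emoji_to_text_convert(original)
restored = emoji.text_to_emoji_convert(as_text)
print(as_text, "->", restored)
# "wajah_gembira -> 😀"
```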
### 5. 🔪 Tokenizer - Tokenization

```python
from nahiarhdNLP.preprocessing import Tokenizer

tokenizer = Tokenizer()

# Tokenize text
text = "Saya suka makan nasi"
tokens = tokenizer.tokenize(text)
print(tokens)
# Output: ['Saya', 'suka', 'makan', 'nasi']
```
### 6. 🌿 Stemmer - Stemming

```python
from nahiarhdNLP.preprocessing import Stemmer

try:
    stemmer = Stemmer()
    text = "bermain-main dengan senang"
    result = stemmer.stem(text)
    print(result)
    # Output: "main main dengan senang"
except ImportError:
    print("Install Sastrawi with: pip install Sastrawi")
```
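The `Stemmer` wraps Sastrawi, which loads its root-word dictionary at construction, so when stemming many documents it is cheaper to create one instance and reuse it. A usage sketch:

```python
from nahiarhdNLP.preprocessing import Stemmer

stemmer = Stemmer()  # construct once, reuse across the corpus
corpus = ["bermain-main dengan senang", "mereka bermainan di taman"]
stemmed = [stemmer.stem(doc) for doc in corpus]
print(stemmed)
```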
### 7. 🛠️ Individual Functions

```python
from nahiarhdNLP.preprocessing import (
    remove_html, remove_emoji, remove_url, remove_mentions, remove_hashtags,
    remove_numbers, remove_punctuation, remove_extra_spaces,
    remove_special_chars, remove_whitespace, to_lowercase,
    replace_spell_corrector, replace_repeated_chars,
    emoji_to_words, words_to_emoji, remove_stopwords,
    stem_text, tokenize
)

# 🧹 BASIC CLEANING FUNCTIONS

# Remove HTML tags
html_text = "website <a href='https://google.com'>google</a>"
clean_result = remove_html(html_text)
print(clean_result)
# Output: "website google"

# Remove emoji
emoji_text = "Halo dunia 😀😁 apa kabar? 🎉"
clean_result = remove_emoji(emoji_text)
print(clean_result)
# Output: "Halo dunia apa kabar?"

# Remove URLs
url_text = "kunjungi https://google.com sekarang!"
clean_result = remove_url(url_text)
print(clean_result)
# Output: "kunjungi sekarang!"

# Remove mentions (@username)
mention_text = "Halo @user123 dan @admin apa kabar?"
clean_result = remove_mentions(mention_text)
print(clean_result)
# Output: "Halo dan apa kabar?"

# Remove hashtags (#tag)
hashtag_text = "Hari ini #senin #libur #weekend"
clean_result = remove_hashtags(hashtag_text)
print(clean_result)
# Output: "Hari ini"

# ✨ NORMALIZATION AND CORRECTION FUNCTIONS

# Slang normalization and spelling correction
slang_text = "emg siapa yg nanya?"
normal_text = replace_spell_corrector(slang_text)
print(normal_text)
# Output: "memang siapa yang bertanya?"

# Collapse word elongation (repeated characters)
elongation_text = "kenapaaa???"
clean_result = replace_repeated_chars(elongation_text)
print(clean_result)
# Output: "kenapaa??"

# 😀 EMOJI FUNCTIONS

# Convert emoji to words
emoji_text = "emoji 😀😁"
text_result = emoji_to_words(emoji_text)
print(text_result)
# Output: "emoji wajah_gembira wajah_gembira_dengan_mata_bahagia"

# Convert words to emoji
text_to_emoji = "emoji wajah_gembira"
emoji_result = words_to_emoji(text_to_emoji)
print(emoji_result)
# Output: "emoji 😀"

# 🔬 LINGUISTIC FUNCTIONS

# Remove stopwords
stopword_text = "siapa yang suruh makan?!!"
clean_result = remove_stopwords(stopword_text)
print(clean_result)
# Output: "suruh makan?!!"

# Stem text (requires Sastrawi)
try:
    stem_text_input = "bermain-main dengan senang"
    stemmed = stem_text(stem_text_input)
    print(stemmed)
    # Output: "main main dengan senang"
except ImportError:
    print("Install Sastrawi: pip install Sastrawi")

# Tokenize text
tokenize_text = "Saya suka makan nasi"
tokens = tokenize(tokenize_text)
print(tokens)
# Output: ['Saya', 'suka', 'makan', 'nasi']
```
### 8. 🔀 Pipeline - Preprocessing in One Pass

Pipeline supports **two usage styles**:

#### A. 🚀 Pipeline with Functions (Simple & Clean)

```python
from nahiarhdNLP.preprocessing import (
    Pipeline, remove_html, remove_url, remove_mentions, remove_hashtags,
    remove_emoji, replace_spell_corrector, to_lowercase, tokenize,
)

# Pass the functions you want directly
pipeline = Pipeline(remove_html, remove_url, remove_mentions)
result = pipeline.process("Hello <b>world</b> @user https://example.com")
print(result)
# Output: "Hello world"

# Pick whichever functions you need
pipeline = Pipeline(remove_url, replace_spell_corrector, to_lowercase)
result = pipeline.process("Halooo https://google.com gw lg nyari info")
print(result)
# Output: "halooo gue lagi mencari info"

# A Pipeline can also be called directly, like a function
result = pipeline("Test text lainnya")
print(result)

# Example for social-media text
social_pipeline = Pipeline(
    remove_mentions,
    remove_hashtags,
    remove_emoji,
    remove_url,
    replace_spell_corrector,
    to_lowercase
)
result = social_pipeline.process("Halooo @user #trending 😀 https://example.com gw lg nyari info")
print(result)
# Output: "halooo gue lagi mencari info"

# Tokenization can be part of the chain too
token_pipeline = Pipeline(remove_url, to_lowercase, tokenize)
tokens = token_pipeline.process("Hello https://google.com World")
print(tokens)  # ['hello', 'world']
```
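Conceptually, a function-mode `Pipeline` is just left-to-right function composition over the text. The sketch below is an illustration of those semantics, not the library's actual implementation:

```python
from nahiarhdNLP.preprocessing import remove_url, to_lowercase

def run_chain(text, *steps):
    # Feed each step the output of the previous one
    for step in steps:
        text = step(text)
    return text

print(run_chain("Hello https://google.com World", remove_url, to_lowercase))
# Roughly equivalent to Pipeline(remove_url, to_lowercase).process(...)
```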
#### B. 🎯 Pipeline with a Config Dictionary (Advanced)

```python
from nahiarhdNLP.preprocessing import Pipeline

# Config dictionary for fine-grained control
config = {
    "remove_emoji": True,
    "remove_url": True,
    "remove_mentions": True,
    "remove_hashtags": True,
    "remove_numbers": True,
    "replace_spell_corrector": True,
    "to_lowercase": True,
    "remove_punctuation": True,
}

pipeline = Pipeline(config)
result = pipeline.process("Halooo @user123 #trending https://example.com gw lg nyari info pnting 😀!!! 123")
print(result)
# Output: "halo gue lagi mencari info penting ๐!!! 123"
# Pipeline with tokenization
tokenize_config = {
    "remove_url": True,
    "remove_mentions": True,
    "replace_spell_corrector": True,
    "to_lowercase": True,
    "tokenize": True,
}

pipe = Pipeline(tokenize_config)
result = pipe.process("gw suka makan nasi @user")
print(result)
# Output: ['gue', 'suka', 'makan', 'nasi']

# Advanced features for config mode
print("Current config:", pipeline.get_config())
print("Enabled steps:", pipeline.get_enabled_steps())

# Update the configuration
pipeline.update_config({"tokenize": True, "remove_stopwords": True})
```
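A configured `Pipeline` is reusable, so cleaning an entire corpus is a one-line comprehension. A usage sketch with the documented config keys:

```python
from nahiarhdNLP.preprocessing import Pipeline

batch_pipeline = Pipeline({
    "remove_url": True,
    "replace_spell_corrector": True,
    "to_lowercase": True,
})
corpus = ["gw lg di rmh", "kunjungi https://google.com sekarang!"]
cleaned = [batch_pipeline.process(text) for text in corpus]
print(cleaned)
```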
#### C. 🔧 The pipeline() Helper Function

```python
from nahiarhdNLP.preprocessing import pipeline

# One-shot preprocessing with a config
config = {"remove_url": True, "replace_spell_corrector": True, "to_lowercase": True}
result = pipeline("Gw lg browsing https://google.com", config)
print(result)
# Output: "gue lagi rosin"
```
#### 📝 Available Functions for Pipeline
```python
# Basic cleaning
remove_html, remove_emoji, remove_url, remove_mentions, remove_hashtags,
remove_numbers, remove_punctuation, remove_special_chars,
remove_whitespace, remove_extra_spaces
# Text transformation
to_lowercase, replace_repeated_chars, replace_spell_corrector
# Emoji handling
emoji_to_words, words_to_emoji
# Linguistic processing
remove_stopwords, stem_text, tokenize
```
### 9. 🎛️ The preprocess Function (Backward Compatibility)

```python
from nahiarhdNLP.preprocessing import preprocess

# Preprocessing with explicit parameters
result = preprocess(
    "Halooo @user!!! 123 😀",
    remove_emoji=True,
    remove_mentions=True,
    remove_numbers=True,
    remove_punctuation=True,
    replace_repeated_chars=True,
    to_lowercase=True,
    replace_spell_corrector=False,
)
print(result)
# Output: "haloo !! 123"
```
### 10. 📊 Dataset Loader

```python
from nahiarhdNLP.datasets import DatasetLoader

loader = DatasetLoader()

# Load stopwords from the local CSV
stopwords = loader.load_stopwords_dataset()
print(f"Number of stopwords: {len(stopwords)}")

# Load the slang dictionary from the local CSV
slang_dict = loader.load_slang_dataset()
print(f"Number of slang pairs: {len(slang_dict)}")

# Load the emoji dictionary from the local CSV
emoji_dict = loader.load_emoji_dataset()
print(f"Number of emoji: {len(emoji_dict)}")

# Load the wordlist from the local JSON
wordlist = loader.load_wordlist_dataset()
print(f"Number of words: {len(wordlist)}")
```
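The examples above only call `len()` on the loaded datasets. Assuming the stopword dataset behaves like a standard Python collection (an assumption, since the exact return type is not documented here), membership can be tested directly:

```python
stopwords = loader.load_stopwords_dataset()
# Assumption: the returned container supports the `in` operator
print("adalah" in stopwords)  # expected: True, per the StopwordRemover example
```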
> **Note:** All datasets (stopwords, slang, emoji, wordlist) are loaded directly from the CSV/JSON files in the `nahiarhdNLP/datasets/` folder; nothing is downloaded from external sources.
## 🔥 Demo Script

To see every library feature in action:

```bash
python -m nahiarhdNLP.demo
```

The demo covers:

- ✅ All individual utility functions
- ✅ The class-based approach
- ✅ The pipeline system (functions & config)
- ✅ Advanced pipeline features
- ✅ Error handling and troubleshooting
## 🚨 Error Handling

```python
try:
    from nahiarhdNLP.preprocessing import SpellCorrector
    spell = SpellCorrector()
    result = spell.correct_sentence("test")
except ImportError:
    print("The nahiarhdNLP package is not installed")
    print("Install it with: pip install nahiarhdNLP")
except Exception as e:
    print(f"Error: {e}")
```
## 💡 Usage Tips

1. **For simple preprocessing**: use `Pipeline(function1, function2, ...)` and just pass the functions!
2. **For fine-grained control**: use `Pipeline(config_dict)` or `preprocess()` with boolean parameters
3. **For full control**: use the individual classes (`TextCleaner`, `SpellCorrector`, etc.)
4. **For spell correction + slang**: use `SpellCorrector`, which combines both features
5. **To remove emoji**: use `remove_emoji()` or set `remove_emoji=True` in Pipeline/preprocess
6. **For stemming**: install Sastrawi first: `pip install Sastrawi`
7. **To load datasets**: use `DatasetLoader` from `nahiarhdNLP.datasets`
8. **For class initialization**: call `_load_data()` on classes that need a dataset
9. **Pipeline design**: `Pipeline(remove_url, to_lowercase)` reads more clearly than a config dictionary
10. **Function chaining**: a Pipeline can be called like a function: `pipeline("text")`
11. **Demo testing**: run `python -m nahiarhdNLP.demo` to see every feature in action
## ⚡ Performance & Datasets

nahiarhdNLP ships with **local datasets**:

- **Stopwords**: `stop_word.csv` (788 words)
- **Slang dictionary**: `slang.csv` (15,675 pairs)
- **Emoji mapping**: `emoji.csv` (3,530 emoji)
- **Wordlist**: `wordlist.json` (Indonesian word list)
- **KBBI dictionary**: `kata_dasar_kbbi.csv` (28,527 words)
- **Supplementary dictionary**: `kamus.txt` (30,871 words)

All datasets live in the `nahiarhdNLP/datasets/` folder and are accessed through `DatasetLoader`.
## 📦 Dependencies

This package requires:

- `pandas` - loading and parsing the CSV/JSON datasets
- `Sastrawi` - stemming (optional)
- `rich` - formatted output in the demo (optional)
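To install the core dependency plus the optional extras:

```bash
pip install pandas          # required: dataset loading
pip install Sastrawi rich   # optional: stemming and demo formatting
```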
## 🔧 Module Structure

```text
nahiarhdNLP/
├── datasets/
│   ├── loaders.py              # DatasetLoader class
│   ├── emoji.csv               # Emoji dataset (3,530 entries)
│   ├── slang.csv               # Slang dataset (15,675 entries)
│   ├── stop_word.csv           # Stopword dataset (788 entries)
│   ├── wordlist.json           # Wordlist dataset
│   ├── kata_dasar_kbbi.csv     # KBBI dataset (28,527 entries)
│   └── kamus.txt               # Supplementary dictionary (30,871 entries)
├── preprocessing/
│   ├── cleaning/
│   │   └── text_cleaner.py     # TextCleaner class
│   ├── linguistic/
│   │   ├── stemmer.py          # Stemmer class
│   │   └── stopwords.py        # StopwordRemover class
│   ├── normalization/
│   │   ├── emoji.py            # EmojiConverter class
│   │   └── spell_corrector.py  # SpellCorrector class
│   ├── tokenization/
│   │   └── tokenizer.py        # Tokenizer class
│   └── utils.py                # Individual utility functions & Pipeline
└── demo.py                     # Usage demo
```
## 🆕 Changelog - Version 1.5.0

- 🚀 **[NEW FEATURE]** Added the `remove_emoji()` function for stripping emoji from text
- ✅ **[NEW]** TextCleaner now has a `clean_emoji()` method for removing emoji
- ✅ **[NEW]** Pipeline supports the "remove_emoji" config key for emoji removal
- ✅ **[NEW]** The preprocess function supports the `remove_emoji=True/False` parameter
- ✅ **[FIX]** Demo script updated with emoji-removal examples
- ✅ **[FIX]** Complete documentation for the emoji-removal feature
- 🚀 **[MAJOR]** Pipeline now supports 2 modes: functions and config dictionary
- ✅ **[NEW]** Pipeline with functions: `Pipeline(remove_url, to_lowercase)`
- ✅ **[NEW]** Pipeline with config: `Pipeline({"remove_url": True, "to_lowercase": True})`
- ✅ **[NEW]** Advanced pipeline features: `get_config()`, `get_enabled_steps()`, `update_config()`
- ✅ **[FIX]** The `pipeline(text, config)` function now works with a config dictionary
- ✅ **[FIX]** TextCleaner now has a correct `clean_html()` method
- ✅ **[FIX]** SpellCorrector demo fixed with proper instantiation
- ✅ **[FIX]** Demo script runs cleanly without errors
- ✅ **[FIX]** Documentation is accurate and matches the implementation
- ✅ **[FIX]** Consistent function names: `replace_spell_corrector`, `replace_repeated_chars`
- ✅ **[FIX]** Backward compatibility via the `preprocess()` function
- ✅ Combined spell correction and slang normalization in `SpellCorrector`
- ✅ All datasets use local files (CSV/JSON)
- ✅ Better-organized structure separating classes and functions
- ✅ Added `DatasetLoader` for centralized dataset management
- ✅ Complete datasets across 6 files (emoji, slang, stopwords, wordlist, KBBI, kamus)
## 🐛 Troubleshooting

**Dataset loading errors:**

```python
from nahiarhdNLP.preprocessing import StopwordRemover

# Be sure to call _load_data() on classes that need a dataset
stopword = StopwordRemover()
stopword._load_data()  # Important!
```
**Sastrawi not found:**
```bash
pip install Sastrawi
```
**pandas not found:**
```bash
pip install pandas
```
**Testing all features:**
```bash
python -m nahiarhdNLP.demo
```
## 📄 License
MIT License
## 👨‍💻 Author
Raihan Hidayatullah Djunaedi [raihanhd.dev@gmail.com](mailto:raihanhd.dev@gmail.com)
---
For complete usage examples, see `demo.py` in this repository or run `python -m nahiarhdNLP.demo`.