# VMP
Generate Vocabulary Management Profiles (VMPs) for an individual text or a corpus (text datasets).
from vmp import VMP, LoadData
# Example 1: Using a list of strings
data = ["This is the first text.", "Here is the second text."]
result = VMP.calculate(
data=data,
delta_values=[9, 11], # Select odd number/s for delta values
common_words_option='both', # Options: 'yes', 'no', 'both'
num_common_words=1000, # Optional parameter for number of common words
common_words_url='https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english.txt',
# common_words_file='path_to_your_common_words_file.txt', # Alternatively, use this
clean_option=True # Default is True
)
print("Results for list of strings:")
print(result)
# Example 2: Using a DataFrame with .txt files
data_loader = LoadData()
df_txt = data_loader.load_data('path_to_your_txt_files_directory', file_type='txt')
result_txt = VMP.calculate(
data=df_txt,
delta_values=[9, 11], # Select odd number/s for delta values
common_words_option='both', # Options: 'yes', 'no', 'both'
num_common_words=1000, # Optional parameter for number of common words
common_words_url='https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english.txt',
# common_words_file='path_to_your_common_words_file.txt', # Alternatively, use this
clean_option=True # Default is True
)
print("Results for DataFrame with .txt files:")
print(result_txt)
# Example 3: Using a DataFrame with .csv file
data_loader = LoadData()
df_csv = data_loader.load_data('path_to_your_csv_file.csv', file_type='csv')
result_csv = VMP.calculate(
data=df_csv,
delta_values=[9, 11], # Select odd number/s for delta values
common_words_option='both', # Options: 'yes', 'no', 'both'
num_common_words=1000, # Optional parameter for number of common words
common_words_url='https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english.txt',
# common_words_file='path_to_your_common_words_file.txt', # Alternatively, use this
clean_option=True # Default is True
)
print("Results for DataFrame with .csv file:")
print(result_csv)
# Example 4: Using a DataFrame with .gz file
data_loader = LoadData()
df_gz = data_loader.load_data('path_to_your_gz_file.gz', file_type='gz')
result_gz = VMP.calculate(
data=df_gz,
delta_values=[9, 11], # Select odd number/s for delta values
common_words_option='both', # Options: 'yes', 'no', 'both'
num_common_words=1000, # Optional parameter for number of common words
common_words_url='https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english.txt',
# common_words_file='path_to_your_common_words_file.txt', # Alternatively, use this
clean_option=True # Default is True
)
print("Results for DataFrame with .gz file:")
print(result_gz)
The package handles all preprocessing. Only the delta values (delta_values, the window sizes reported as delta_x in the output) and the common-word (stopword) list need to be specified.
# Input
The **VMP.calculate** method requires a text or corpus input. This can be passed directly as a list of strings, or loaded from an individual .txt document, a directory (corpus) containing multiple .txt documents, or a .csv or .gz file where each row contains the text of a particular document. (Supported file types: .txt, .csv, and .gz.)
# Output
The VMP.calculate method returns a dictionary where the results are structured as follows:
index: The index position of the interval in the original text.
last_pos: The position of the last token in the interval within the original text.
avg_score: The average score for the interval, representing the relative distance of repeated tokens within the window.
last_word: The last word in the interval.
context: The text within the interval, providing context for the analysis.
last_previous_position: A dictionary showing the last previous position of each token in the interval before the current window.
filename: The source filename or identifier of the text being analyzed.
delta_x: The size of the interval (window) used in the analysis.
vocab_option: Indicates whether common words were replaced with 'x' (commonYes) or not (commonNo).
# Installation
pip install vmp
pip install git+https://github.com/matthewdurward/vmp.git
# How It Works
Vocabulary Management Profiles (VMPs) were initially conceived by Youmans (https://journals.sagepub.com/doi/abs/10.2190/BY6N-ABUA-EM1D-RX0V) as a form of discourse and narrative analysis.
This package follows Youmans' implementation of VMP2.2 (https://web.archive.org/web/20060911150345/http://web.missouri.edu/~youmansc/vmp/help/vmp22.html).
VMP2.2 calculates ratios using a wrap-around method during the second pass through the text. This means that the first occurrence of a word near the beginning of the text is compared to its last occurrence near the end, resulting in a ratio closer to 0.0 rather than 1.0. Words that appear only once in the text retain a ratio of 1.0. Unlike the initial pass analysis, VMP2.2 avoids a rapid downtrend at the beginning of the text, reflecting a more familiar second reading where the start of the text is as well-known as the end. This approach aligns with our typical reading patterns, where rhetorical structures are more evident during subsequent readings than during the first.
Raw data
{
"_id": null,
"home_page": "https://github.com/matthewdurward/vmp",
"name": "vmp",
"maintainer": null,
"docs_url": null,
"requires_python": null,
"maintainer_email": null,
"keywords": "text analytics, natural language processing, computational linguistics, vocabulary, lexical diversity, corpus, corpora",
"author": "Matthew Durward, Christopher Thomson",
"author_email": "matthew.durward@pg.canterbury.ac.nz",
"download_url": "https://files.pythonhosted.org/packages/25/78/f462c408d80dba6d2bd4ec4cb03579476888f3685e3c594fbb1e771b3ddb/vmp-0.0.13.tar.gz",
"platform": null,
"description": "# VMP\r\n\r\nGenerate Vocabulary Management Profiles (vmp) for an individual text or corpus (text datasets). \r\n\r\n from vmp import VMP, LoadData\r\n \r\n # Example 1: Using a list of strings\r\n data = [\"This is the first text.\", \"Here is the second text.\"]\r\n result = VMP.calculate(\r\n data=data,\r\n delta_values=[9, 11], # Select odd number/s for delta values\r\n common_words_option='both', # Options: 'yes', 'no', 'both'\r\n num_common_words=1000, # Optional parameter for number of common words\r\n common_words_url='https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english.txt',\r\n # common_words_file='path_to_your_common_words_file.txt', # Alternatively, use this\r\n clean_option=True # Default is True\r\n )\r\n print(\"Results for list of strings:\")\r\n print(result)\r\n\r\n # Example 2: Using a DataFrame with .txt files\r\n data_loader = LoadData()\r\n df_txt = data_loader.load_data('path_to_your_txt_files_directory', file_type='txt')\r\n result_txt = VMP.calculate(\r\n data=df_txt,\r\n delta_values=[9, 11], # Select odd number/s for delta values\r\n common_words_option='both', # Options: 'yes', 'no', 'both'\r\n num_common_words=1000, # Optional parameter for number of common words\r\n common_words_url='https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english.txt',\r\n # common_words_file='path_to_your_common_words_file.txt', # Alternatively, use this\r\n clean_option=True # Default is True\r\n )\r\n print(\"Results for DataFrame with .txt files:\")\r\n print(result_txt)\r\n\r\n # Example 3: Using a DataFrame with .csv file\r\n data_loader = LoadData()\r\n df_csv = data_loader.load_data('path_to_your_csv_file.csv', file_type='csv')\r\n result_csv = VMP.calculate(\r\n data=df_csv,\r\n delta_values=[9, 11], # Select odd number/s for delta values\r\n common_words_option='both', # Options: 'yes', 'no', 'both'\r\n num_common_words=1000, # Optional parameter for 
number of common words\r\n common_words_url='https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english.txt',\r\n # common_words_file='path_to_your_common_words_file.txt', # Alternatively, use this\r\n clean_option=True # Default is True\r\n )\r\n print(\"Results for DataFrame with .csv file:\")\r\n print(result_csv)\r\n\r\n # Example 4: Using a DataFrame with .gz file\r\n data_loader = LoadData()\r\n df_gz = data_loader.load_data('path_to_your_gz_file.gz', file_type='gz')\r\n result_gz = VMP.calculate(\r\n data=df_gz,\r\n delta_values=[9, 11], # Select odd number/s for delta values\r\n common_words_option='both', # Options: 'yes', 'no', 'both'\r\n num_common_words=1000, # Optional parameter for number of common words\r\n common_words_url='https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english.txt',\r\n # common_words_file='path_to_your_common_words_file.txt', # Alternatively, use this\r\n clean_option=True # Default is True\r\n )\r\n print(\"Results for DataFrame with .gz file:\")\r\n print(result_gz)\r\n\r\n\r\nThe package contains all preprocessing. Only the delta_x and stopword list need to be specified.\r\n\r\n# Input\r\n\r\nThe **VMP.calculate** method requires a text or corpus input. These can be loaded either as an individual .txt document, a directory, or corpus, containing multiple .txt documents, or a .csv or .gz file where each row contains the text of a particular document. 
(supports .txt and .gz files).\r\n\r\n# Output\r\n\r\nThe vmp.calculate function returns a dictionary where the results are structured as follows:\r\n\r\n index: The index position of the interval in the original text.\r\n last_pos: The position of the last token in the interval within the original text.\r\n avg_score: The average score for the interval, representing the relative distance of repeated tokens within the window.\r\n last_word: The last word in the interval.\r\n context: The text within the interval, providing context for the analysis.\r\n last_previous_position: A dictionary showing the last previous position of each token in the interval before the current window.\r\n filename: The source filename or identifier of the text being analyzed.\r\n delta_x: The size of the interval (window) used in the analysis.\r\n vocab_option: Indicates whether common words were replaced with 'x' (commonYes) or not (commonNo).\r\n\r\n# Installation\r\n\r\n pip install vmp\r\n\r\n pip install git+https://github.com/matthewdurward/vmp.git\r\n \r\n# How It Works\r\n\r\nVocabulary Management Profiles (VMPs) were initially conceived by Youmans (https://journals.sagepub.com/doi/abs/10.2190/BY6N-ABUA-EM1D-RX0V) as a form of discourse and narrative analysis. \r\n\r\nThis package follows Youmans' implementation of the VMP2.2 (https://web.archive.org/web/20060911150345/http://web.missouri.edu/~youmansc/vmp/help/vmp22.html)\r\n\r\nVMP2.2 calculates ratios using a wrap-around method during the second pass through the text. This means that the first occurrence of a word near the beginning of the text is compared to its last occurrence near the end, resulting in a ratio closer to 0.0 rather than 1.0. Words that appear only once in the text retain a ratio of 1.0. Unlike the initial pass analysis, VMP2.2 avoids a rapid downtrend at the beginning of the text, reflecting a more familiar second reading where the start of the text is as well-known as the end. 
This approach aligns with our typical reading patterns, where rhetorical structures are more evident during subsequent readings rather than the first.\r\n\r\n",
"bugtrack_url": null,
"license": "GNU GENERAL PUBLIC LICENSE v3",
"summary": "Generating Vocabulary Management Profiles in Python",
"version": "0.0.13",
"project_urls": {
"Homepage": "https://github.com/matthewdurward/vmp"
},
"split_keywords": [
"text analytics",
" natural language processing",
" computational linguistics",
" vocabulary",
" lexical diversity",
" corpus",
" corpora"
],
"urls": [
{
"comment_text": "",
"digests": {
"blake2b_256": "4e1c9a18d2f32cf3b8749df071e70187884f5ef8b5336612b37f353496c393e3",
"md5": "c0bd5ef9d97d64e37114f9c7b961d498",
"sha256": "9b0af53ced9fa8373cb631a32f40a52d733cdf7c331288ee72e39eb0a82adc96"
},
"downloads": -1,
"filename": "vmp-0.0.13-py3-none-any.whl",
"has_sig": false,
"md5_digest": "c0bd5ef9d97d64e37114f9c7b961d498",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": null,
"size": 7033,
"upload_time": "2024-06-03T07:54:36",
"upload_time_iso_8601": "2024-06-03T07:54:36.053098Z",
"url": "https://files.pythonhosted.org/packages/4e/1c/9a18d2f32cf3b8749df071e70187884f5ef8b5336612b37f353496c393e3/vmp-0.0.13-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
},
{
"comment_text": "",
"digests": {
"blake2b_256": "2578f462c408d80dba6d2bd4ec4cb03579476888f3685e3c594fbb1e771b3ddb",
"md5": "367dbd8ddbf21696e34e336218aa5eb6",
"sha256": "2fbd042c519cb5689b239d5884885a3b231cb3b2c2d3c809f22171a1278023b7"
},
"downloads": -1,
"filename": "vmp-0.0.13.tar.gz",
"has_sig": false,
"md5_digest": "367dbd8ddbf21696e34e336218aa5eb6",
"packagetype": "sdist",
"python_version": "source",
"requires_python": null,
"size": 8569,
"upload_time": "2024-06-03T07:54:37",
"upload_time_iso_8601": "2024-06-03T07:54:37.301609Z",
"url": "https://files.pythonhosted.org/packages/25/78/f462c408d80dba6d2bd4ec4cb03579476888f3685e3c594fbb1e771b3ddb/vmp-0.0.13.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2024-06-03 07:54:37",
"github": true,
"gitlab": false,
"bitbucket": false,
"codeberg": false,
"github_user": "matthewdurward",
"github_project": "vmp",
"travis_ci": false,
"coveralls": false,
"github_actions": false,
"lcname": "vmp"
}