# Adaptive Trainer
A Hugging Face `Trainer` extension designed for more efficient and effective fine-tuning of language models using adaptive loss mechanisms and curriculum learning principles.
## Features
* **Adaptive Loss Calculation:** Focuses training on tokens where the model is less confident or incorrect, rather than weighting all tokens equally (see the sketch after this list).
* **Ideas Learning:** Lets the model learn concepts and behaviour from the training data rather than memorising it verbatim, much like a student learning.
* **Learning Style Differentiation:** Supports different training objectives (e.g., "ideas" vs. "attention") per dataset, so datasets that need precise recall can be trained with attention-style learning and conceptual datasets with ideas-style learning.
* **Integration with Hugging Face:** Built on top of the `transformers` library and `Trainer` API.
* **More Features:** user-facing parameter control and hyperparameter optimisation of the loss are coming soon.
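
The snippet below is a minimal, illustrative sketch of the top-k masking idea behind the adaptive loss described above. It is not the library's internal implementation; `topk_masked_loss` and its arguments are hypothetical names used only for illustration.

```python
import torch
import torch.nn.functional as F

def topk_masked_loss(logits: torch.Tensor, labels: torch.Tensor,
                     top_k: int = 8, ignore_index: int = -100) -> torch.Tensor:
    """Cross-entropy restricted to tokens the model does not already get 'right',
    i.e. tokens whose label falls outside the model's top-k predictions."""
    shift_logits = logits[:, :-1, :]               # predict token t+1 from position t
    shift_labels = labels[:, 1:]
    topk_ids = shift_logits.topk(top_k, dim=-1).indices                 # (B, T-1, k)
    already_known = (topk_ids == shift_labels.unsqueeze(-1)).any(dim=-1)  # (B, T-1)
    # Ignore tokens the model already ranks in its top-k; loss focuses on the rest.
    # (A real implementation would also guard against every token being masked.)
    masked_labels = shift_labels.masked_fill(already_known, ignore_index)
    return F.cross_entropy(
        shift_logits.reshape(-1, shift_logits.size(-1)),
        masked_labels.reshape(-1),
        ignore_index=ignore_index,
    )
```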
## Installation
You can install `adaptive-trainer` (Python >= 3.9) using pip:
```bash
pip install adaptive-trainer
```
*To install locally from the repo, clone it, check out the required branch, and run:*
```bash
pip install .
```
*If you want to install optional dependencies such as flash-attention and bitsandbytes:*
```bash
pip install flash-attn --no-build-isolation
pip install bitsandbytes
```
## Quick Start
```python
from adaptive_trainer import train_adaptively, AdaptiveTrainer # Assuming you export AdaptiveTrainer too
# Define system prompts
system_prompts = {
'both': "Your role as an assistant is ...",
'ideas': "Your role as an assistant is ...",
'attention': "Your role as an assistant is ..."
}
data_processing_function = lambda sample, context_mode: (sample['user'], sample['assistant'])
# Note: for now you cannot train more than one assistant response per sample. If you need a conversation
# history before the model response being trained, a workaround is to fold the earlier turns into the first
# element `sample['user']`; you will, however, need to add the header start/close tokens that separate user
# and assistant text manually.
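# A hypothetical example of that workaround (the field names 'history', 'user', 'assistant' and the
# header tokens below are placeholders; use the chat-template tokens of the model you are fine-tuning):
def history_folding_processor(sample, context_mode):
    user_text = ""
    for turn in sample.get('history', []):           # earlier (user, assistant) turns, if any
        user_text += "<user_start>" + turn['user'] + "<turn_end>"
        user_text += "<assistant_start>" + turn['assistant'] + "<turn_end>"
    user_text += sample['user']                       # final user message to answer
    return user_text, sample['assistant']             # only this assistant turn is trained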
# Configure datasets
datasets_config = {
'ideas': ["user/my-ideas-dataset:|:K"],
'attention': ["user/my-attention-dataset"],
'both': [],
'misc': [],
'data_processing_function':data_processing_function,
# 'data_processing_function_attention':None,
# 'data_processing_function_ideas':None
# 'data_processing_function_dataset_name':None
}
# The `:|:K` suffix trims `user/my-ideas-dataset` to its top K rows (after filtering by `max_length_token`).
# The same can be done for attention datasets: append :|:K to the dataset name and only the top K rows are used.
# The validation set is scaled down proportionately to max(K*len(val)/len(train), 10) when len(val) > 10;
# otherwise it is not scaled.
# The dataset preprocessing function, which is expected to convert a dataset sample row into user and
# assistant response strings, is resolved in the following fallback order:
# data_processing_function_dataset_name -> data_processing_function_{ideas/attention} -> data_processing_function
# **IMPORTANT**: passing data_processing_function_dataset_name with the value None skips the fallback functions
# and instead uses an internal hard-coded preprocessing function chosen per dataset pattern.
# The hard-coded preprocessing may fail and raise an error, so please pass an appropriate data_processing_function.
datasets_kwargs = {
 'user/my-ideas-dataset:|:K': {'data_dir': None, 'context_mode': '<parameter_value_for_dataset_preprocessing_function>', 'train_split': '<train_split_name>', 'val_split': '<val_split_name>'},
'user/my-ideas-dataset': {}, # Both keys are valid and should work
'user/my-attention-dataset': {}
}
# Use this to pass custom arguments to the `load_dataset` function from the `datasets` library
# optional, can be `None`
# train_split, val_split -> if the dataset has named splits they can be passed here; the dataset is expected
# to have at least one top-level split when loaded
dataset_specific_system_prompts = {
'user/my-ideas-dataset:|:K': 'Dataset_specific_prompt',
'user/my-ideas-dataset': 'Dataset_specific_prompt', # Both keys are valid and should work
'user/my-attention-dataset': 'Dataset_specific_prompt'
}
# The dataset-specific system prompt is appended to the master system prompt: system_prompt + dataset_specific_prompt
# optional, can be `None`
# Configure training parameters
# Below are default values for all currently supported user_inputs:
training_config = {
'run_name': 'my_adaptive_run',
'wandb_project': 'my-adaptive-experiments',
 'wandb_entity': None,
 'max_length_token': 4096, # samples (user + assistant text) longer than this are filtered out of the datasets
'padding_side': 'left',
'batch_size': 4,
'learning_rate': 2e-5,
'gradient_accumulation_steps': 16,
'num_epochs': 3,
'attn_impl': None, # will use default implementation `sdpa` if available or `eager` otherwise
'save_total_limit': 5,
'fp16': True,
'gradient_checkpointing': True,
'eval_strategy': 'steps',
'save_strategy': 'steps',
'local_rank': -1,
'logging_steps':10,
'weight_decay': 0.01,
'warmup_steps':10,
'eval_steps': 200,
'save_steps': 300,
'eval_batch_size': 4, # defaults to batch_size
'optimizer': 'paged_adamw_8bit',
'use_liger_kernel': False,
'eval_on_start': False
}
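# Note on the defaults above: the effective batch size per optimizer step is
# batch_size * gradient_accumulation_steps = 4 * 16 = 64 samples per device
# (multiply by the number of devices when training on multiple GPUs).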
# Configure adaptive loss parameters
adaptive_loss_config = {
 'top_k': 8, # Currently top_k is the only parameter exposed for user control of the adaptive loss
 'adaptive_log_steps': 100 # adaptive-training metrics are logged every this many training steps
}
# A lower top_k is expected to enforce stricter learning, possibly causing previously learned behaviour
# to be forgotten, while a higher top_k allows more lenient learning.
# top_k=1 can make ideas learning behave the same as attention learning.
# Run training
model_path = train_adaptively(
model_name="model_to_train",
datasets_config=datasets_config,
 datasets_kwargs=datasets_kwargs,
dataset_specific_system_prompts=dataset_specific_system_prompts,
output_dir="./my_finetuned_model_adaptive",
 huggingface_repo="your_username/my_finetuned_model_adaptive", # Optional; if provided and the environment variable HF_TOKEN is set, the model will be uploaded
system_prompts=system_prompts,
training_config=training_config,
adaptive_loss_config=adaptive_loss_config
)
print(f"Training complete. Model saved to {model_path}")
```
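After training, the returned `model_path` can be loaded for a quick smoke test. This is a minimal sketch assuming the checkpoint is saved in the standard `transformers` format; the prompt and generation settings are placeholders.
```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Placeholder prompt; in practice, apply the same system prompt / chat template used in training.
inputs = tokenizer("Hello, what can you help me with?", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```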
## Contributing
Contribution guidelines have not been published yet. For now, please open issues at https://github.com/samarthpusalkar/Adaptive_Trainer/issues.
## License
This project is licensed under the Apache License, Version 2.0.