# LinMulT
[![License](https://img.shields.io/badge/license-MIT-yellow.svg)](LICENSE)
[![python](https://img.shields.io/badge/Python-3.10-3776AB.svg?style=flat&logo=python&logoColor=white)](https://www.python.org)
[![pytorch](https://img.shields.io/badge/PyTorch-2.0.1-EE4C2C.svg?style=flat&logo=pytorch)](https://pytorch.org)
General-purpose Multimodal Transformer with Linear Complexity Attention Mechanism.
# Setup
### Install package from PyPi
```bash
pip install linmult
```
### Install package from repository root
```bash
git clone https://github.com/fodorad/LinMulT
cd LinMulT
pip install -e .
pip install -U -r requirements.txt
python -m unittest
```
# Quick start
### Example 1:
A simple transformer encoder with linear attention. The forward pass takes a single input sequence.
```python
import torch
from linmult import LinT
# input shape: (batch_size, time_dimension, feature_dimension)
x = torch.rand((32, 15, 1024), device='cuda')
model = LinT(input_modality_channels=1024, output_dim=5).cuda()
y_pred_seq = model(x)
# output shape: (batch_size, time_dimension, output_dimension)
assert y_pred_seq.size() == torch.Size([32, 15, 5])
```
### Example 2:
Multimodal Transformer with Linear Attention.
The forward pass takes two input sequences that share the same time dimension.
```python
import torch
from linmult import LinMulT
# input shape: (batch_size, time_dimension, feature_dimension)
x_1 = torch.rand((32, 15, 1024), device='cuda')
x_2 = torch.rand((32, 15, 160), device='cuda')
model = LinMulT(input_modality_channels=[1024, 160], output_dim=5).cuda()
y_pred_cls, y_pred_seq = model([x_1, x_2])
# 1. output shape: (batch_size, output_dimension)
assert y_pred_cls.size() == torch.Size([32, 5])
# 2. output shape: (batch_size, time_dimension, output_dimension)
assert y_pred_seq.size() == torch.Size([32, 15, 5])
```
### Example 3:
Multimodal Transformer with Linear Attention. The forward pass takes three input sequences with different time dimensions.
```python
import torch
from linmult import LinMulT
# input shape: (batch_size, time_dimension, feature_dimension)
x_1 = torch.rand((16, 1500, 25), device='cuda')
x_2 = torch.rand((16, 450, 35), device='cuda')
x_3 = torch.rand((16, 120, 768), device='cuda')
model = LinMulT(input_modality_channels=[25, 35, 768],
                output_dim=5,
                add_time_collapse=True,
                add_self_attention_fusion=False).cuda()
y_pred_cls = model([x_1, x_2, x_3])
# output shape: (batch_size, output_dimension)
assert y_pred_cls.size() == torch.Size([16, 5])
```
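With `add_time_collapse=True`, the model returns a single vector per sample instead of a sequence, which is why the inputs above may have mismatched time dimensions. The sketch below illustrates the general idea of such a temporal collapse step using simple mean pooling (an assumption for illustration only; the library's actual collapse operation may differ):

```python
import numpy as np

def collapse_time(x: np.ndarray) -> np.ndarray:
    """Pool a (batch, time, features) sequence into (batch, features)
    by averaging over the time dimension (illustrative mean pooling)."""
    return x.mean(axis=1)

# Sequences with different time dimensions collapse to the same rank-2 layout,
# so the per-modality vectors can then be fused into one prediction.
a = collapse_time(np.random.rand(16, 1500, 25))   # shape (16, 25)
b = collapse_time(np.random.rand(16, 450, 35))    # shape (16, 35)
```

After collapsing, the time dimension is gone, so only the aggregated prediction `y_pred_cls` is returned in this configuration.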
# Similar projects using LinMulT
### (2023) BlinkLinMulT
LinMulT is trained for blink presence detection and eye state recognition tasks.
Our results demonstrate performance comparable or superior to state-of-the-art models on 2 tasks, using 7 public benchmark databases.
* paper: BlinkLinMulT: Transformer-based Eye Blink Detection (accepted, available soon)
* code: https://github.com/fodorad/BlinkLinMulT
### (2022) PersonalityLinMulT
LinMulT is trained for Big Five personality trait estimation using the First Impressions V2 dataset and sentiment estimation using the MOSI and MOSEI datasets.
* paper: Multimodal Sentiment and Personality Perception Under Speech: A Comparison of Transformer-based Architectures ([pdf](https://proceedings.mlr.press/v173/fodor22a/fodor22a.pdf), [website](https://proceedings.mlr.press/v173/fodor22a.html))
* code: https://github.com/fodorad/PersonalityLinMulT
# Citation - BibTex
If you found our research helpful or influential, please consider citing:
### (2023) LinMulT for blink presence detection and eye state recognition:
```bibtex
@article{blinklinmult-fodor23,
  title   = {BlinkLinMulT: Transformer-based Eye Blink Detection},
  author  = {Fodor, {\'A}d{\'a}m and Fenech, Kristian and L{\H{o}}rincz, Andr{\'a}s},
  journal = {...},
  pages   = {1--19},
  year    = {2023}
}
```
### (2022) LinMulT for personality trait and sentiment estimation:
```bibtex
@InProceedings{pmlr-v173-fodor22a,
  title     = {Multimodal Sentiment and Personality Perception Under Speech: A Comparison of Transformer-based Architectures},
  author    = {Fodor, {\'A}d{\'a}m and Saboundji, Rachid R. and Jacques Junior, Julio C. S. and Escalera, Sergio and Gallardo-Pujol, David and L{\H{o}}rincz, Andr{\'a}s},
  booktitle = {Understanding Social Behavior in Dyadic and Small Group Interactions},
  pages     = {218--241},
  year      = {2022},
  editor    = {Palmero, Cristina and Jacques Junior, Julio C. S. and Clapés, Albert and Guyon, Isabelle and Tu, Wei-Wei and Moeslund, Thomas B. and Escalera, Sergio},
  volume    = {173},
  series    = {Proceedings of Machine Learning Research},
  month     = {16 Oct},
  publisher = {PMLR},
  pdf       = {https://proceedings.mlr.press/v173/fodor22a/fodor22a.pdf},
  url       = {https://proceedings.mlr.press/v173/fodor22a.html}
}
```
# Acknowledgement
The code is inspired by the following two works:
### Multimodal Transformer:
* paper: Multimodal Transformer for Unaligned Multimodal Language Sequences ([1906.00295](https://arxiv.org/pdf/1906.00295.pdf))
* code: https://github.com/yaohungt/Multimodal-Transformer
### Linear Attention:
* paper: Transformers are RNNs: Fast Autoregressive Transformers with Linear Attention ([2006.16236](https://arxiv.org/pdf/2006.16236.pdf))
* code: https://github.com/idiap/fast-transformers
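
The core trick from the linear attention paper above can be sketched in a few lines: replacing the softmax with a kernel feature map φ (here `elu(x) + 1`, as in the paper) lets the key–value product be computed once, reducing the cost from O(N²) to O(N) in sequence length. This is an illustrative NumPy sketch, not the library's implementation:

```python
import numpy as np

def elu_feature_map(x):
    """phi(x) = elu(x) + 1, the positive feature map from the paper."""
    return np.where(x > 0, x + 1.0, np.exp(x))

def linear_attention(Q, K, V, eps=1e-6):
    """Linear-complexity attention: phi(Q) @ (phi(K)^T V), normalized.
    Q, K: (N, d); V: (N, d_v). Cost is O(N * d * d_v) instead of O(N^2)."""
    Qp, Kp = elu_feature_map(Q), elu_feature_map(K)
    kv = Kp.T @ V                      # (d, d_v), computed once for all queries
    z = Qp @ Kp.sum(axis=0) + eps      # (N,), per-query normalizer
    return (Qp @ kv) / z[:, None]      # (N, d_v)

N, d = 128, 16
out = linear_attention(np.random.randn(N, d),
                       np.random.randn(N, d),
                       np.random.randn(N, d))
assert out.shape == (N, d)
```

Because φ is strictly positive, each output row is a convex combination of the value rows, mirroring what softmax attention computes while never materializing the N×N attention matrix.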
# Contact
* Ádám Fodor (foauaai@inf.elte.hu)