Publications
2024
Zhang, Hao; Chang, Di; Li, Fang; Soleymani, Mohammad; Ahuja, Narendra
MagicPose4D: Crafting Articulated Models with Appearance and Motion Control Miscellaneous
2024, (Version Number: 1).
@misc{zhang_magicpose4d_2024,
title = {MagicPose4D: Crafting Articulated Models with Appearance and Motion Control},
author = {Hao Zhang and Di Chang and Fang Li and Mohammad Soleymani and Narendra Ahuja},
url = {https://arxiv.org/abs/2405.14017},
doi = {10.48550/ARXIV.2405.14017},
year = {2024},
date = {2024-05-01},
urldate = {2024-06-25},
publisher = {arXiv},
abstract = {With the success of 2D and 3D visual generative models, there is growing interest in generating 4D content. Existing methods primarily rely on text prompts to produce 4D content, but they often fall short of accurately defining complex or rare motions. To address this limitation, we propose MagicPose4D, a novel framework for refined control over both appearance and motion in 4D generation. Unlike traditional methods, MagicPose4D accepts monocular videos as motion prompts, enabling precise and customizable motion generation. MagicPose4D comprises two key modules:
i) Dual-Phase 4D Reconstruction Module, which operates in two phases. The first phase focuses on capturing the model's shape using accurate 2D supervision and less accurate but geometrically informative 3D pseudo-supervision without imposing skeleton constraints. The second phase refines the model using more accurate pseudo-3D supervision obtained in the first phase, and introduces kinematic chain-based skeleton constraints to ensure physical plausibility. Additionally, we propose a Global-local Chamfer loss that aligns the overall distribution of predicted mesh vertices with the supervision while maintaining part-level alignment without extra annotations.
ii) Cross-category Motion Transfer Module, which leverages the predictions from the 4D reconstruction module and uses a kinematic-chain-based skeleton to achieve cross-category motion transfer. It ensures smooth transitions between frames through dynamic rigidity, facilitating robust generalization without additional training.
Through extensive experiments, we demonstrate that MagicPose4D significantly improves the accuracy and consistency of 4D content generation, outperforming existing methods in various benchmarks.},
note = {Version Number: 1},
keywords = {VGL, Virtual Humans},
pubstate = {published},
tppubtype = {misc}
}
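The Global-local Chamfer loss mentioned in this abstract combines a standard Chamfer term over all mesh vertices with per-part terms. Below is a minimal PyTorch sketch of that idea; the exact formulation in MagicPose4D may differ, and the part labels passed in here are assumed inputs for illustration only.

```python
import torch

def chamfer(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    """Symmetric Chamfer distance between point sets a (N, 3) and b (M, 3)."""
    d = torch.cdist(a, b)                        # (N, M) pairwise Euclidean distances
    return d.min(dim=1).values.mean() + d.min(dim=0).values.mean()

def global_local_chamfer(pred, target, pred_parts, target_parts, w_local=1.0):
    """Global Chamfer over all vertices plus the mean of per-part Chamfer terms.

    Assumes every part id in pred_parts also appears in target_parts.
    """
    loss = chamfer(pred, target)
    local = [chamfer(pred[pred_parts == p], target[target_parts == p])
             for p in pred_parts.unique()]
    return loss + w_local * torch.stack(local).mean()
```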
Chang, Di; Shi, Yichun; Gao, Quankai; Fu, Jessica; Xu, Hongyi; Song, Guoxian; Yan, Qing; Zhu, Yizhe; Yang, Xiao; Soleymani, Mohammad
MagicPose: Realistic Human Poses and Facial Expressions Retargeting with Identity-aware Diffusion Miscellaneous
2024, (arXiv:2311.12052 [cs]).
@misc{chang_magicpose_2024,
title = {MagicPose: Realistic Human Poses and Facial Expressions Retargeting with Identity-aware Diffusion},
author = {Di Chang and Yichun Shi and Quankai Gao and Jessica Fu and Hongyi Xu and Guoxian Song and Qing Yan and Yizhe Zhu and Xiao Yang and Mohammad Soleymani},
url = {http://arxiv.org/abs/2311.12052},
year = {2024},
date = {2024-05-01},
urldate = {2024-07-18},
publisher = {arXiv},
abstract = {In this work, we propose MagicPose, a diffusion-based model for 2D human pose and facial expression retargeting. Specifically, given a reference image, we aim to generate a person's new images by controlling the poses and facial expressions while keeping the identity unchanged. To this end, we propose a two-stage training strategy to disentangle human motions and appearance (e.g., facial expressions, skin tone and dressing), consisting of (1) the pre-training of an appearance-control block and (2) learning appearance-disentangled pose control. Our novel design enables robust appearance control over generated human images, including body, facial attributes, and even background. By leveraging the prior knowledge of image diffusion models, MagicPose generalizes well to unseen human identities and complex poses without the need for additional fine-tuning. Moreover, the proposed model is easy to use and can be considered as a plug-in module/extension to Stable Diffusion. The code is available at: https://github.com/Boese0601/MagicDance},
note = {arXiv:2311.12052 [cs]},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
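The two-stage strategy described in this abstract (pre-train an appearance-control block, then learn appearance-disentangled pose control) amounts to alternating which parameter groups are trainable. The skeleton below is a generic PyTorch illustration of that scheme under stated assumptions; the module names are placeholders, not the authors' architecture.

```python
import torch
from torch import nn

def set_trainable(module: nn.Module, flag: bool) -> None:
    """Freeze or unfreeze all parameters of a module."""
    for p in module.parameters():
        p.requires_grad_(flag)

# Placeholder blocks standing in for the appearance-control and pose-control branches.
appearance_block = nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 16))
pose_block = nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 16))

# Stage 1: pre-train the appearance-control block (pose branch frozen).
set_trainable(appearance_block, True)
set_trainable(pose_block, False)
stage1_opt = torch.optim.AdamW(appearance_block.parameters(), lr=1e-4)

# Stage 2: learn pose control with the appearance block frozen, so pose edits
# cannot leak appearance changes into the generated images.
set_trainable(appearance_block, False)
set_trainable(pose_block, True)
stage2_opt = torch.optim.AdamW(pose_block.parameters(), lr=1e-4)
```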
Bohy, Hugo; Tran, Minh; Haddad, Kevin El; Dutoit, Thierry; Soleymani, Mohammad
Social-MAE: A Transformer-Based Multimodal Autoencoder for Face and Voice Proceedings Article
In: 2024 IEEE 18th International Conference on Automatic Face and Gesture Recognition (FG), pp. 1–5, IEEE, Istanbul, Turkiye, 2024, ISBN: 9798350394948.
@inproceedings{bohy_social-mae_2024,
title = {Social-MAE: A Transformer-Based Multimodal Autoencoder for Face and Voice},
author = {Hugo Bohy and Minh Tran and Kevin El Haddad and Thierry Dutoit and Mohammad Soleymani},
url = {https://ieeexplore.ieee.org/document/10581940/},
doi = {10.1109/FG59268.2024.10581940},
isbn = {9798350394948},
year = {2024},
date = {2024-05-01},
urldate = {2024-07-18},
booktitle = {2024 IEEE 18th International Conference on Automatic Face and Gesture Recognition (FG)},
pages = {1–5},
publisher = {IEEE},
address = {Istanbul, Turkiye},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Soleymani, Mohammad; Rahmani, Mehdi; Bigdeli, Nooshin
Robust Tube-Based Reference Tracking Nonlinear Model Predictive Control for Wind Turbines Journal Article
In: IEEE Trans. Automat. Sci. Eng., pp. 1–13, 2024, ISSN: 1545-5955, 1558-3783.
@article{soleymani_robust_2024,
title = {Robust Tube-Based Reference Tracking Nonlinear Model Predictive Control for Wind Turbines},
author = {Mohammad Soleymani and Mehdi Rahmani and Nooshin Bigdeli},
url = {https://ieeexplore.ieee.org/document/10495787/},
doi = {10.1109/TASE.2024.3385714},
issn = {1545-5955, 1558-3783},
year = {2024},
date = {2024-04-01},
urldate = {2024-04-16},
journal = {IEEE Trans. Automat. Sci. Eng.},
pages = {1–13},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Soleymani, Mohammad; Kumano, Shiro; Provost, Emily Mower; Bianchi-Berthouze, Nadia; Sano, Akane; Suzuki, Kenji
Guest Editorial Best of ACII 2021 Journal Article
In: IEEE Trans. Affective Comput., vol. 15, no. 2, pp. 376–379, 2024, ISSN: 1949-3045, 2371-9850.
@article{soleymani_guest_2024,
title = {Guest Editorial Best of ACII 2021},
author = {Mohammad Soleymani and Shiro Kumano and Emily Mower Provost and Nadia Bianchi-Berthouze and Akane Sano and Kenji Suzuki},
url = {https://ieeexplore.ieee.org/document/10542496/},
doi = {10.1109/TAFFC.2024.3389249},
issn = {1949-3045, 2371-9850},
year = {2024},
date = {2024-04-01},
urldate = {2024-06-25},
journal = {IEEE Trans. Affective Comput.},
volume = {15},
number = {2},
pages = {376–379},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Tran, Minh; Chang, Di; Siniukov, Maksim; Soleymani, Mohammad
Dyadic Interaction Modeling for Social Behavior Generation Miscellaneous
2024, (arXiv:2403.09069 [cs]).
@misc{tran_dyadic_2024,
title = {Dyadic Interaction Modeling for Social Behavior Generation},
author = {Minh Tran and Di Chang and Maksim Siniukov and Mohammad Soleymani},
url = {http://arxiv.org/abs/2403.09069},
year = {2024},
date = {2024-03-01},
urldate = {2024-03-19},
publisher = {arXiv},
abstract = {Human-human communication is like a delicate dance where listeners and speakers concurrently interact to maintain conversational dynamics. Hence, an effective model for generating listener nonverbal behaviors requires understanding the dyadic context and interaction. In this paper, we present an effective framework for creating 3D facial motions in dyadic interactions. Existing work considers a listener as a reactive agent with reflexive behaviors to the speaker's voice and facial motions. The heart of our framework is Dyadic Interaction Modeling (DIM), a pre-training approach that jointly models speakers' and listeners' motions through masking and contrastive learning to learn representations that capture the dyadic context. To enable the generation of non-deterministic behaviors, we encode both listener and speaker motions into discrete latent representations through a VQ-VAE. The pre-trained model is further fine-tuned for motion generation. Extensive experiments demonstrate the superiority of our framework in generating listener motions, establishing a new state-of-the-art according to the quantitative measures capturing the diversity and realism of generated motions. Qualitative results demonstrate the superior capabilities of the proposed approach in generating diverse and realistic expressions, eye blinks and head gestures.},
note = {arXiv:2403.09069 [cs]},
keywords = {DTIC, UARC, Virtual Humans},
pubstate = {published},
tppubtype = {misc}
}
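The abstract above encodes listener and speaker motions into discrete latent codes with a VQ-VAE. A minimal, generic vector-quantization step of that kind is sketched below; it is not the DIM codebase, and the codebook size and feature dimension are arbitrary.

```python
import torch
from torch import nn

class VectorQuantizer(nn.Module):
    """Nearest-neighbor quantization with a straight-through gradient estimator."""
    def __init__(self, num_codes: int = 512, dim: int = 64, beta: float = 0.25):
        super().__init__()
        self.codebook = nn.Embedding(num_codes, dim)
        self.beta = beta

    def forward(self, z: torch.Tensor):
        # z: (batch, dim) continuous motion features from an encoder.
        distances = torch.cdist(z, self.codebook.weight)     # (batch, num_codes)
        indices = distances.argmin(dim=1)                     # discrete code ids
        z_q = self.codebook(indices)
        # Codebook loss + commitment loss (standard VQ-VAE objective terms).
        vq_loss = ((z_q - z.detach()) ** 2).mean() + self.beta * ((z_q.detach() - z) ** 2).mean()
        z_q = z + (z_q - z).detach()                          # straight-through pass
        return z_q, indices, vq_loss
```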
Lu, Liupei; Yin, Yufeng; Gu, Yuming; Wu, Yizhen; Prasad, Pratusha; Zhao, Yajie; Soleymani, Mohammad
Leveraging Synthetic Data for Generalizable and Fair Facial Action Unit Detection Miscellaneous
2024, (arXiv:2403.10737 [cs]).
@misc{lu_leveraging_2024,
title = {Leveraging Synthetic Data for Generalizable and Fair Facial Action Unit Detection},
author = {Liupei Lu and Yufeng Yin and Yuming Gu and Yizhen Wu and Pratusha Prasad and Yajie Zhao and Mohammad Soleymani},
url = {http://arxiv.org/abs/2403.10737},
year = {2024},
date = {2024-03-01},
urldate = {2024-04-16},
publisher = {arXiv},
abstract = {Facial action unit (AU) detection is a fundamental block for objective facial expression analysis. Supervised learning approaches require a large amount of manual labeling which is costly. The limited labeled data are also not diverse in terms of gender which can affect model fairness. In this paper, we propose to use synthetically generated data and multi-source domain adaptation (MSDA) to address the problems of the scarcity of labeled data and the diversity of subjects. Specifically, we propose to generate a diverse dataset through synthetic facial expression re-targeting by transferring the expressions from real faces to synthetic avatars. Then, we use MSDA to transfer the AU detection knowledge from a real dataset and the synthetic dataset to a target dataset. Instead of aligning the overall distributions of different domains, we propose Paired Moment Matching (PM2) to align the features of the paired real and synthetic data with the same facial expression. To further improve gender fairness, PM2 matches the features of the real data with a female and a male synthetic image. Our results indicate that synthetic data and the proposed model improve both AU detection performance and fairness across genders, demonstrating its potential to solve AU detection in-the-wild.},
note = {arXiv:2403.10737 [cs]},
keywords = {DTIC, UARC, Virtual Humans},
pubstate = {published},
tppubtype = {misc}
}
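Paired Moment Matching (PM2), as described in the abstract above, aligns features of real and synthetic images that share a facial expression. The sketch below shows a generic first- and second-moment matching penalty between two paired feature batches; it illustrates the general idea under stated assumptions and is not the paper's exact loss.

```python
import torch

def moment_matching_loss(real_feats: torch.Tensor, synth_feats: torch.Tensor) -> torch.Tensor:
    """Penalize gaps between the first two moments of paired feature batches of shape (N, D)."""
    mean_gap = (real_feats.mean(dim=0) - synth_feats.mean(dim=0)).pow(2).sum()
    var_gap = (real_feats.var(dim=0) - synth_feats.var(dim=0)).pow(2).sum()
    return mean_gap + var_gap

# Usage (hypothetical): batches are ordered so row i of each shares the same expression.
# loss = moment_matching_loss(encoder(real_batch), encoder(synthetic_batch))
```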
Shi, Zhonghao; O'Connell, Allison; Li, Zongjian; Liu, Siqi; Ayissi, Jennifer; Hoffman, Guy; Soleymani, Mohammad; Matarić, Maja J.
Build Your Own Robot Friend: An Open-Source Learning Module for Accessible and Engaging AI Education Miscellaneous
2024, (arXiv:2402.01647 [cs]).
@misc{shi_build_2024,
title = {Build Your Own Robot Friend: An Open-Source Learning Module for Accessible and Engaging AI Education},
author = {Zhonghao Shi and Allison O'Connell and Zongjian Li and Siqi Liu and Jennifer Ayissi and Guy Hoffman and Mohammad Soleymani and Maja J. Matarić},
url = {http://arxiv.org/abs/2402.01647},
year = {2024},
date = {2024-01-01},
urldate = {2024-02-21},
publisher = {arXiv},
abstract = {As artificial intelligence (AI) is playing an increasingly important role in our society and global economy, AI education and literacy have become necessary components in college and K-12 education to prepare students for an AI-powered society. However, current AI curricula have not yet been made accessible and engaging enough for students and schools from all socio-economic backgrounds with different educational goals. In this work, we developed an open-source learning module for college and high school students, which allows students to build their own robot companion from the ground up. This open platform can be used to provide hands-on experience and introductory knowledge about various aspects of AI, including robotics, machine learning (ML), software engineering, and mechanical engineering. Because of the social and personal nature of a socially assistive robot companion, this module also puts a special emphasis on human-centered AI, enabling students to develop a better understanding of human-AI interaction and AI ethics through hands-on learning activities. With open-source documentation, assembling manuals and affordable materials, students from different socio-economic backgrounds can personalize their learning experience based on their individual educational goals. To evaluate the student-perceived quality of our module, we conducted a usability testing workshop with 15 college students recruited from a minority-serving institution. Our results indicate that our AI module is effective, easy-to-follow, and engaging, and it increases student interest in studying AI/ML and robotics in the future. We hope that this work will contribute toward accessible and engaging AI education in human-AI interaction for college and high school students.},
note = {arXiv:2402.01647 [cs]},
keywords = {Virtual Humans},
pubstate = {published},
tppubtype = {misc}
}
2023
Zhou, Emily; Soleymani, Mohammad; Matarić, Maja J.
Investigating the Generalizability of Physiological Characteristics of Anxiety Proceedings Article
In: 2023 IEEE International Conference on Bioinformatics and Biomedicine (BIBM), pp. 4848–4855, IEEE, Istanbul, Turkiye, 2023, ISBN: 9798350337488.
@inproceedings{zhou_investigating_2023,
title = {Investigating the Generalizability of Physiological Characteristics of Anxiety},
author = {Emily Zhou and Mohammad Soleymani and Maja J. Matarić},
url = {https://ieeexplore.ieee.org/document/10385292/},
doi = {10.1109/BIBM58861.2023.10385292},
isbn = {9798350337488},
year = {2023},
date = {2023-12-01},
urldate = {2024-04-16},
booktitle = {2023 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)},
pages = {4848–4855},
publisher = {IEEE},
address = {Istanbul, Turkiye},
keywords = {Machine Learning},
pubstate = {published},
tppubtype = {inproceedings}
}
Chang, Di; Shi, Yichun; Gao, Quankai; Fu, Jessica; Xu, Hongyi; Song, Guoxian; Yan, Qing; Yang, Xiao; Soleymani, Mohammad
MagicDance: Realistic Human Dance Video Generation with Motions & Facial Expressions Transfer Miscellaneous
2023, (arXiv:2311.12052 [cs]).
@misc{chang_magicdance_2023,
title = {MagicDance: Realistic Human Dance Video Generation with Motions & Facial Expressions Transfer},
author = {Di Chang and Yichun Shi and Quankai Gao and Jessica Fu and Hongyi Xu and Guoxian Song and Qing Yan and Xiao Yang and Mohammad Soleymani},
url = {http://arxiv.org/abs/2311.12052},
year = {2023},
date = {2023-11-01},
urldate = {2023-12-07},
publisher = {arXiv},
abstract = {In this work, we propose MagicDance, a diffusion-based model for 2D human motion and facial expression transfer on challenging human dance videos. Specifically, we aim to generate human dance videos of any target identity driven by novel pose sequences while keeping the identity unchanged. To this end, we propose a two-stage training strategy to disentangle human motions and appearance (e.g., facial expressions, skin tone and dressing), consisting of the pretraining of an appearance-control block and fine-tuning of an appearance-pose-joint-control block over human dance poses of the same dataset. Our novel design enables robust appearance control with temporally consistent upper body, facial attributes, and even background. The model also generalizes well on unseen human identities and complex motion sequences without the need for any fine-tuning with additional data with diverse human attributes by leveraging the prior knowledge of image diffusion models. Moreover, the proposed model is easy to use and can be considered as a plug-in module/extension to Stable Diffusion. We also demonstrate the model's ability for zero-shot 2D animation generation, enabling not only the appearance transfer from one identity to another but also allowing for cartoon-like stylization given only pose inputs. Extensive experiments demonstrate our superior performance on the TikTok dataset.},
note = {arXiv:2311.12052 [cs]},
keywords = {DTIC, UARC, Virtual Humans},
pubstate = {published},
tppubtype = {misc}
}
Tran, Minh; Soleymani, Mohammad
Privacy-preserving Representation Learning for Speech Understanding Miscellaneous
2023, (arXiv:2310.17194 [eess]).
@misc{tran_privacy-preserving_2023,
title = {Privacy-preserving Representation Learning for Speech Understanding},
author = {Minh Tran and Mohammad Soleymani},
url = {http://arxiv.org/abs/2310.17194},
year = {2023},
date = {2023-10-01},
urldate = {2023-12-07},
publisher = {arXiv},
abstract = {Existing privacy-preserving speech representation learning methods target a single application domain. In this paper, we present a novel framework to anonymize utterance-level speech embeddings generated by pre-trained encoders and show its effectiveness for a range of speech classification tasks. Specifically, given the representations from a pre-trained encoder, we train a Transformer to estimate the representations for the same utterances spoken by other speakers. During inference, the extracted representations can be converted into different identities to preserve privacy. We compare the results with the voice anonymization baselines from the VoicePrivacy 2022 challenge. We evaluate our framework on speaker identification for privacy and emotion recognition, depression classification, and intent classification for utility. Our method outperforms the baselines on privacy and utility in paralinguistic tasks and achieves comparable performance for intent classification.},
note = {arXiv:2310.17194 [eess]},
keywords = {UARC, Virtual Humans},
pubstate = {published},
tppubtype = {misc}
}
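The framework above maps an utterance-level embedding to the embedding of the same utterance as spoken by another speaker. A small, hypothetical embedding-to-embedding mapper in that spirit is sketched below; the architecture details (a two-token Transformer, the dimensions) are illustrative assumptions, not the paper's configuration.

```python
import torch
from torch import nn

class EmbeddingAnonymizer(nn.Module):
    """Predict the target-speaker version of an utterance-level speech embedding."""
    def __init__(self, dim: int = 768, nhead: int = 8, num_layers: int = 2):
        super().__init__()
        layer = nn.TransformerEncoderLayer(d_model=dim, nhead=nhead, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=num_layers)
        self.out = nn.Linear(dim, dim)

    def forward(self, utt_emb: torch.Tensor, spk_emb: torch.Tensor) -> torch.Tensor:
        # utt_emb, spk_emb: (batch, dim); stack them as a two-token sequence.
        tokens = torch.stack([utt_emb, spk_emb], dim=1)
        hidden = self.encoder(tokens)
        return self.out(hidden[:, 0])             # converted utterance embedding

# Training would regress toward embeddings of the same utterance from other speakers;
# at inference, swapping spk_emb converts the representation to a different identity.
```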
Ahmed, Tamim; Rikakis, Thanassis; Kelliher, Aisling; Soleymani, Mohammad
ASAR Dataset and Computational Model for Affective State Recognition During ARAT Assessment for Upper Extremity Stroke Survivors Proceedings Article
In: International Conference on Multimodal Interaction, pp. 11–15, ACM, Paris, France, 2023, ISBN: 9798400703218.
@inproceedings{ahmed_asar_2023,
title = {ASAR Dataset and Computational Model for Affective State Recognition During ARAT Assessment for Upper Extremity Stroke Survivors},
author = {Tamim Ahmed and Thanassis Rikakis and Aisling Kelliher and Mohammad Soleymani},
url = {https://dl.acm.org/doi/10.1145/3610661.3617154},
doi = {10.1145/3610661.3617154},
isbn = {9798400703218},
year = {2023},
date = {2023-10-01},
urldate = {2023-12-07},
booktitle = {International Conference on Multimodal Interaction},
pages = {11–15},
publisher = {ACM},
address = {Paris, France},
keywords = {DTIC, UARC, Virtual Humans},
pubstate = {published},
tppubtype = {inproceedings}
}
Andrist, Sean; Bohus, Dan; Li, Zongjian; Soleymani, Mohammad
Platform for Situated Intelligence and OpenSense: A Tutorial on Building Multimodal Interactive Applications for Research Proceedings Article
In: International Conference on Multimodal Interaction, pp. 105–106, ACM, Paris, France, 2023, ISBN: 9798400703218.
@inproceedings{andrist_platform_2023,
title = {Platform for Situated Intelligence and OpenSense: A Tutorial on Building Multimodal Interactive Applications for Research},
author = {Sean Andrist and Dan Bohus and Zongjian Li and Mohammad Soleymani},
url = {https://dl.acm.org/doi/10.1145/3610661.3617603},
doi = {10.1145/3610661.3617603},
isbn = {9798400703218},
year = {2023},
date = {2023-10-01},
urldate = {2023-12-07},
booktitle = {International Conference on Multimodal Interaction},
pages = {105–106},
publisher = {ACM},
address = {Paris, France},
keywords = {AI, UARC, Virtual Humans},
pubstate = {published},
tppubtype = {inproceedings}
}
Tran, Trang; Yin, Yufeng; Tavabi, Leili; Delacruz, Joannalyn; Borsari, Brian; Woolley, Joshua D; Scherer, Stefan; Soleymani, Mohammad
Multimodal Analysis and Assessment of Therapist Empathy in Motivational Interviews Proceedings Article
In: International Conference on Multimodal Interaction, pp. 406–415, ACM, Paris, France, 2023, ISBN: 9798400700552.
@inproceedings{tran_multimodal_2023,
title = {Multimodal Analysis and Assessment of Therapist Empathy in Motivational Interviews},
author = {Trang Tran and Yufeng Yin and Leili Tavabi and Joannalyn Delacruz and Brian Borsari and Joshua D Woolley and Stefan Scherer and Mohammad Soleymani},
url = {https://dl.acm.org/doi/10.1145/3577190.3614105},
doi = {10.1145/3577190.3614105},
isbn = {9798400700552},
year = {2023},
date = {2023-10-01},
urldate = {2023-12-07},
booktitle = {International Conference on Multimodal Interaction},
pages = {406–415},
publisher = {ACM},
address = {Paris, France},
keywords = {DTIC, UARC, Virtual Humans},
pubstate = {published},
tppubtype = {inproceedings}
}
Gainer, Alesia; Aptaker, Allison; Artstein, Ron; Cobbins, David; Core, Mark; Gordon, Carla; Leuski, Anton; Li, Zongjian; Merchant, Chirag; Nelson, David; Soleymani, Mohammad; Traum, David
DIVIS: Digital Interactive Victim Intake Simulator Proceedings Article
In: Proceedings of the 23rd ACM International Conference on Intelligent Virtual Agents, pp. 1–2, ACM, Würzburg, Germany, 2023, ISBN: 978-1-4503-9994-4.
@inproceedings{gainer_divis_2023,
title = {DIVIS: Digital Interactive Victim Intake Simulator},
author = {Alesia Gainer and Allison Aptaker and Ron Artstein and David Cobbins and Mark Core and Carla Gordon and Anton Leuski and Zongjian Li and Chirag Merchant and David Nelson and Mohammad Soleymani and David Traum},
url = {https://dl.acm.org/doi/10.1145/3570945.3607328},
doi = {10.1145/3570945.3607328},
isbn = {978-1-4503-9994-4},
year = {2023},
date = {2023-09-01},
urldate = {2024-02-20},
booktitle = {Proceedings of the 23rd ACM International Conference on Intelligent Virtual Agents},
pages = {1–2},
publisher = {ACM},
address = {Würzburg, Germany},
keywords = {DTIC, MxR, UARC, Virtual Humans},
pubstate = {published},
tppubtype = {inproceedings}
}
Chang, Di; Yin, Yufeng; Li, Zongjian; Tran, Minh; Soleymani, Mohammad
LibreFace: An Open-Source Toolkit for Deep Facial Expression Analysis Miscellaneous
2023, (arXiv:2308.10713 [cs]).
@misc{chang_libreface_2023,
title = {LibreFace: An Open-Source Toolkit for Deep Facial Expression Analysis},
author = {Di Chang and Yufeng Yin and Zongjian Li and Minh Tran and Mohammad Soleymani},
url = {http://arxiv.org/abs/2308.10713},
year = {2023},
date = {2023-08-01},
urldate = {2024-02-21},
publisher = {arXiv},
abstract = {Facial expression analysis is an important tool for human-computer interaction. In this paper, we introduce LibreFace, an open-source toolkit for facial expression analysis. This open-source toolbox offers real-time and offline analysis of facial behavior through deep learning models, including facial action unit (AU) detection, AU intensity estimation, and facial expression recognition. To accomplish this, we employ several techniques, including the utilization of a large-scale pre-trained network, feature-wise knowledge distillation, and task-specific fine-tuning. These approaches are designed to effectively and accurately analyze facial expressions by leveraging visual information, thereby facilitating the implementation of real-time interactive applications. In terms of Action Unit (AU) intensity estimation, we achieve a Pearson Correlation Coefficient (PCC) of 0.63 on DISFA, which is 7% higher than the performance of OpenFace 2.0 while maintaining highly-efficient inference that runs two times faster than OpenFace 2.0. Despite being compact, our model also demonstrates competitive performance to state-of-the-art facial expression analysis methods on AffectNet, FFHQ, and RAF-DB. Our code will be released at https://github.com/ihp-lab/LibreFace},
note = {arXiv:2308.10713 [cs]},
keywords = {DTIC, Virtual Humans},
pubstate = {published},
tppubtype = {misc}
}
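The AU-intensity result quoted above is a Pearson Correlation Coefficient (PCC) between predicted and ground-truth intensities. For reference, a per-AU PCC can be computed as in the short sketch below.

```python
import numpy as np

def pearson_cc(pred: np.ndarray, target: np.ndarray) -> float:
    """Pearson correlation between predicted and ground-truth AU intensities."""
    pred = pred - pred.mean()
    target = target - target.mean()
    return float((pred * target).sum() / (np.linalg.norm(pred) * np.linalg.norm(target)))

# Example: pearson_cc(np.array([0.0, 1.0, 2.0, 3.0]), np.array([0.1, 0.9, 2.2, 2.8]))
```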
Yin, Yufeng; Chang, Di; Song, Guoxian; Sang, Shen; Zhi, Tiancheng; Liu, Jing; Luo, Linjie; Soleymani, Mohammad
FG-Net: Facial Action Unit Detection with Generalizable Pyramidal Features Miscellaneous
2023, (arXiv:2308.12380 [cs]).
@misc{yin_fg-net_2023,
title = {FG-Net: Facial Action Unit Detection with Generalizable Pyramidal Features},
author = {Yufeng Yin and Di Chang and Guoxian Song and Shen Sang and Tiancheng Zhi and Jing Liu and Linjie Luo and Mohammad Soleymani},
url = {http://arxiv.org/abs/2308.12380},
year = {2023},
date = {2023-08-01},
urldate = {2024-02-21},
publisher = {arXiv},
abstract = {Automatic detection of facial Action Units (AUs) allows for objective facial expression analysis. Due to the high cost of AU labeling and the limited size of existing benchmarks, previous AU detection methods tend to overfit the dataset, resulting in a significant performance loss when evaluated across corpora. To address this problem, we propose FG-Net for generalizable facial action unit detection. Specifically, FG-Net extracts feature maps from a StyleGAN2 model pre-trained on a large and diverse face image dataset. Then, these features are used to detect AUs with a Pyramid CNN Interpreter, making the training efficient and capturing essential local features. The proposed FG-Net achieves a strong generalization ability for heatmap-based AU detection thanks to the generalizable and semantic-rich features extracted from the pre-trained generative model. Extensive experiments are conducted to evaluate within- and cross-corpus AU detection with the widely-used DISFA and BP4D datasets. Compared with the state-of-the-art, the proposed method achieves superior cross-domain performance while maintaining competitive within-domain performance. In addition, FG-Net is data-efficient and achieves competitive performance even when trained on 1000 samples. Our code will be released at https://github.com/ihp-lab/FG-Net},
note = {arXiv:2308.12380 [cs]},
keywords = {DTIC, Virtual Humans},
pubstate = {published},
tppubtype = {misc}
}
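As a rough illustration of the recipe the FG-Net abstract describes, the sketch below decodes multi-resolution feature maps from a frozen pre-trained generator into per-AU heatmaps with a small pyramid CNN. The feature pyramid is faked with random tensors, and the layer choices are assumptions; the actual StyleGAN2 feature extraction and interpreter architecture live in the authors' repository.

```python
# Rough sketch: decode frozen multi-resolution ("pyramidal") features into
# per-AU heatmaps with a small CNN. The random tensors below stand in for
# activations hooked from a pre-trained generator; this is not the FG-Net code.
import torch
import torch.nn as nn
import torch.nn.functional as F

class PyramidInterpreter(nn.Module):
    def __init__(self, channels_per_level, num_aus, out_size=64):
        super().__init__()
        self.reduce = nn.ModuleList([nn.Conv2d(c, 64, kernel_size=1) for c in channels_per_level])
        self.head = nn.Conv2d(64 * len(channels_per_level), num_aus, kernel_size=3, padding=1)
        self.out_size = out_size

    def forward(self, feature_pyramid):
        # Project every level to a common width, resize, concatenate, and decode.
        resized = [F.interpolate(conv(f), size=self.out_size, mode="bilinear", align_corners=False)
                   for conv, f in zip(self.reduce, feature_pyramid)]
        return self.head(torch.cat(resized, dim=1))      # (B, num_aus, H, W) heatmaps

# Placeholder pyramid: three resolutions for a batch of two face images.
feats = [torch.randn(2, 512, 8, 8), torch.randn(2, 256, 16, 16), torch.randn(2, 128, 32, 32)]
heatmaps = PyramidInterpreter([512, 256, 128], num_aus=12)(feats)   # AU presence read from peaks
```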
Tran, Minh; Yin, Yufeng; Soleymani, Mohammad
Personalized Adaptation with Pre-trained Speech Encoders for Continuous Emotion Recognition Proceedings Article
In: INTERSPEECH 2023, pp. 636–640, ISCA, 2023.
Links | BibTeX | Tags: DTIC, Emotions, UARC, Virtual Humans
@inproceedings{tran_personalized_2023,
title = {Personalized Adaptation with Pre-trained Speech Encoders for Continuous Emotion Recognition},
author = {Minh Tran and Yufeng Yin and Mohammad Soleymani},
url = {https://www.isca-speech.org/archive/interspeech_2023/tran23c_interspeech.html},
doi = {10.21437/Interspeech.2023-2170},
year = {2023},
date = {2023-08-01},
urldate = {2023-08-23},
booktitle = {INTERSPEECH 2023},
pages = {636–640},
publisher = {ISCA},
keywords = {DTIC, Emotions, UARC, Virtual Humans},
pubstate = {published},
tppubtype = {inproceedings}
}
Tran, Minh; Soleymani, Mohammad
A Speech Representation Anonymization Framework via Selective Noise Perturbation Proceedings Article
In: ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1–5, IEEE, Rhodes Island, Greece, 2023, ISBN: 978-1-72816-327-7.
Links | BibTeX | Tags: DTIC, UARC, Virtual Humans
@inproceedings{tran_speech_2023,
title = {A Speech Representation Anonymization Framework via Selective Noise Perturbation},
author = {Minh Tran and Mohammad Soleymani},
url = {https://ieeexplore.ieee.org/document/10095173/},
doi = {10.1109/ICASSP49357.2023.10095173},
isbn = {978-1-72816-327-7},
year = {2023},
date = {2023-06-01},
urldate = {2023-08-23},
booktitle = {ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
pages = {1–5},
publisher = {IEEE},
address = {Rhodes Island, Greece},
keywords = {DTIC, UARC, Virtual Humans},
pubstate = {published},
tppubtype = {inproceedings}
}
2022
Yin, Yufeng; Xu, Jiashu; Zu, Tianxin; Soleymani, Mohammad
X-Norm: Exchanging Normalization Parameters for Bimodal Fusion Proceedings Article
In: Proceedings of the 2022 International Conference on Multimodal Interaction, pp. 605–614, ACM, Bengaluru, India, 2022, ISBN: 978-1-4503-9390-4.
Links | BibTeX | Tags: DTIC, Emotions, Virtual Humans
@inproceedings{yin_x-norm_2022,
title = {X-Norm: Exchanging Normalization Parameters for Bimodal Fusion},
author = {Yufeng Yin and Jiashu Xu and Tianxin Zu and Mohammad Soleymani},
url = {https://dl.acm.org/doi/10.1145/3536221.3556581},
doi = {10.1145/3536221.3556581},
isbn = {978-1-4503-9390-4},
year = {2022},
date = {2022-11-01},
urldate = {2023-08-24},
booktitle = {Proceedings of the 2022 International Conference on Multimodal Interaction},
pages = {605–614},
publisher = {ACM},
address = {Bengaluru, India},
keywords = {DTIC, Emotions, Virtual Humans},
pubstate = {published},
tppubtype = {inproceedings}
}
Zhang, Larry; Kolacz, Jacek; Rizzo, Albert; Scherer, Stefan; Soleymani, Mohammad
Speech Behavioral Markers Align on Symptom Factors in Psychological Distress Proceedings Article
In: 2022 10th International Conference on Affective Computing and Intelligent Interaction (ACII), pp. 1–8, 2022, (ISSN: 2156-8111).
Abstract | Links | BibTeX | Tags: DTIC, MedVR, UARC
@inproceedings{zhang_speech_2022,
title = {Speech Behavioral Markers Align on Symptom Factors in Psychological Distress},
author = {Larry Zhang and Jacek Kolacz and Albert Rizzo and Stefan Scherer and Mohammad Soleymani},
url = {https://ieeexplore.ieee.org/abstract/document/9953849},
doi = {10.1109/ACII55700.2022.9953849},
year = {2022},
date = {2022-10-01},
booktitle = {2022 10th International Conference on Affective Computing and Intelligent Interaction (ACII)},
pages = {1–8},
abstract = {Automatic detection of psychological disorders has gained significant attention in recent years due to the rise in their prevalence. However, the majority of studies have overlooked the complexity of disorders in favor of a “present/not present” dichotomy in representing disorders. Recent psychological research challenges this dichotomy and favors transdiagnostic approaches, moving beyond general disorder classifications to symptom-level analysis, as symptoms are often not exclusive to individual disorder classes. In our study, we investigated the link between speech signals and psychological distress symptoms in a corpus of 333 screening interviews from the Distress Analysis Interview Corpus (DAIC). Given the semi-structured organization of interviews, we aggregated speech utterances from responses to shared questions across interviews. We employed deterministic sample selection in classification to rank salient questions for eliciting symptom-specific behaviors in order to predict symptom presence. Some questions include “Do you find therapy helpful?” and “When was the last time you felt happy?”. The prediction results align closely with the factor structure of psychological distress symptoms, linking speech behaviors primarily to somatic and affective alterations in both depression and PTSD. This lends support to the transdiagnostic validity of speech markers for detecting such symptoms. We did not find a strong link between speech markers and cognitive or psychomotor alterations. This is surprising, given the complexity of motor and cognitive actions required in speech production. The results of our analysis highlight the importance of aligning affective computing research with psychological research to investigate the use of automatic behavioral sensing to assess psychiatric risk.},
note = {ISSN: 2156-8111},
keywords = {DTIC, MedVR, UARC},
pubstate = {published},
tppubtype = {inproceedings}
}
Tran, Minh; Soleymani, Mohammad
A Pre-Trained Audio-Visual Transformer for Emotion Recognition Proceedings Article
In: ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4698–4702, IEEE, Singapore, Singapore, 2022, ISBN: 978-1-66540-540-9.
Links | BibTeX | Tags: DTIC, Emotions, Virtual Humans
@inproceedings{tran_pre-trained_2022,
title = {A Pre-Trained Audio-Visual Transformer for Emotion Recognition},
author = {Minh Tran and Mohammad Soleymani},
url = {https://ieeexplore.ieee.org/document/9747278/},
doi = {10.1109/ICASSP43922.2022.9747278},
isbn = {978-1-66540-540-9},
year = {2022},
date = {2022-05-01},
urldate = {2022-09-23},
booktitle = {ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
pages = {4698–4702},
publisher = {IEEE},
address = {Singapore, Singapore},
keywords = {DTIC, Emotions, Virtual Humans},
pubstate = {published},
tppubtype = {inproceedings}
}
Zhu, Haidong; Zheng, Zhaoheng; Soleymani, Mohammad; Nevatia, Ram
Self-Supervised Learning for Sentiment Analysis via Image-Text Matching Proceedings Article
In: ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1710–1714, IEEE, Singapore, Singapore, 2022, ISBN: 978-1-66540-540-9.
Links | BibTeX | Tags: Emotions
@inproceedings{zhu_self-supervised_2022,
title = {Self-Supervised Learning for Sentiment Analysis via Image-Text Matching},
author = {Haidong Zhu and Zhaoheng Zheng and Mohammad Soleymani and Ram Nevatia},
url = {https://ieeexplore.ieee.org/document/9747819/},
doi = {10.1109/ICASSP43922.2022.9747819},
isbn = {978-1-66540-540-9},
year = {2022},
date = {2022-05-01},
urldate = {2022-09-23},
booktitle = {ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
pages = {1710–1714},
publisher = {IEEE},
address = {Singapore, Singapore},
keywords = {Emotions},
pubstate = {published},
tppubtype = {inproceedings}
}
2021
Tran, Minh; Bradley, Ellen; Matvey, Michelle; Woolley, Joshua; Soleymani, Mohammad
Modeling Dynamics of Facial Behavior for Mental Health Assessment Proceedings Article
In: 2021 16th IEEE International Conference on Automatic Face and Gesture Recognition (FG 2021), pp. 1–5, IEEE, Jodhpur, India, 2021, ISBN: 978-1-66543-176-7.
Links | BibTeX | Tags: DTIC, Emotions, Virtual Humans
@inproceedings{tran_modeling_2021,
title = {Modeling Dynamics of Facial Behavior for Mental Health Assessment},
author = {Minh Tran and Ellen Bradley and Michelle Matvey and Joshua Woolley and Mohammad Soleymani},
url = {https://ieeexplore.ieee.org/document/9666955/},
doi = {10.1109/FG52635.2021.9666955},
isbn = {978-1-66543-176-7},
year = {2021},
date = {2021-12-01},
urldate = {2022-09-23},
booktitle = {2021 16th IEEE International Conference on Automatic Face and Gesture Recognition (FG 2021)},
pages = {1–5},
publisher = {IEEE},
address = {Jodhpur, India},
keywords = {DTIC, Emotions, Virtual Humans},
pubstate = {published},
tppubtype = {inproceedings}
}
Yin, Yufeng; Lu, Liupei; Wu, Yizhen; Soleymani, Mohammad
Self-Supervised Patch Localization for Cross-Domain Facial Action Unit Detection Proceedings Article
In: 2021 16th IEEE International Conference on Automatic Face and Gesture Recognition (FG 2021), pp. 1–8, IEEE, Jodhpur, India, 2021, ISBN: 978-1-66543-176-7.
Links | BibTeX | Tags: DTIC, Emotions, Virtual Humans
@inproceedings{yin_self-supervised_2021,
title = {Self-Supervised Patch Localization for Cross-Domain Facial Action Unit Detection},
author = {Yufeng Yin and Liupei Lu and Yizhen Wu and Mohammad Soleymani},
url = {https://ieeexplore.ieee.org/document/9667048/},
doi = {10.1109/FG52635.2021.9667048},
isbn = {978-1-66543-176-7},
year = {2021},
date = {2021-12-01},
urldate = {2022-09-23},
booktitle = {2021 16th IEEE International Conference on Automatic Face and Gesture Recognition (FG 2021)},
pages = {1–8},
publisher = {IEEE},
address = {Jodhpur, India},
keywords = {DTIC, Emotions, Virtual Humans},
pubstate = {published},
tppubtype = {inproceedings}
}
Kontogiorgos, Dimosthenis; Tran, Minh; Gustafson, Joakim; Soleymani, Mohammad
A Systematic Cross-Corpus Analysis of Human Reactions to Robot Conversational Failures Proceedings Article
In: Proceedings of the 2021 International Conference on Multimodal Interaction, pp. 112–120, ACM, Montréal, QC, Canada, 2021, ISBN: 978-1-4503-8481-0.
Links | BibTeX | Tags: DTIC, Emotions, UARC, Virtual Humans
@inproceedings{kontogiorgos_systematic_2021,
title = {A Systematic Cross-Corpus Analysis of Human Reactions to Robot Conversational Failures},
author = {Dimosthenis Kontogiorgos and Minh Tran and Joakim Gustafson and Mohammad Soleymani},
url = {https://dl.acm.org/doi/10.1145/3462244.3479887},
doi = {10.1145/3462244.3479887},
isbn = {978-1-4503-8481-0},
year = {2021},
date = {2021-10-01},
urldate = {2022-09-23},
booktitle = {Proceedings of the 2021 International Conference on Multimodal Interaction},
pages = {112–120},
publisher = {ACM},
address = {Montréal, QC, Canada},
keywords = {DTIC, Emotions, UARC, Virtual Humans},
pubstate = {published},
tppubtype = {inproceedings}
}
Yin, Yufeng; Lu, Liupei; Xiao, Yao; Xu, Zhi; Cai, Kaijie; Jiang, Haonan; Gratch, Jonathan; Soleymani, Mohammad
Contrastive Learning for Domain Transfer in Cross-Corpus Emotion Recognition Proceedings Article
In: 2021 9th International Conference on Affective Computing and Intelligent Interaction (ACII), pp. 1–8, IEEE, Nara, Japan, 2021, ISBN: 978-1-66540-019-0.
Links | BibTeX | Tags: DTIC, Emotions, Virtual Humans
@inproceedings{yin_contrastive_2021,
title = {Contrastive Learning for Domain Transfer in Cross-Corpus Emotion Recognition},
author = {Yufeng Yin and Liupei Lu and Yao Xiao and Zhi Xu and Kaijie Cai and Haonan Jiang and Jonathan Gratch and Mohammad Soleymani},
url = {https://ieeexplore.ieee.org/document/9597453/},
doi = {10.1109/ACII52823.2021.9597453},
isbn = {978-1-66540-019-0},
year = {2021},
date = {2021-09-01},
urldate = {2022-09-23},
booktitle = {2021 9th International Conference on Affective Computing and Intelligent Interaction (ACII)},
pages = {1–8},
publisher = {IEEE},
address = {Nara, Japan},
keywords = {DTIC, Emotions, Virtual Humans},
pubstate = {published},
tppubtype = {inproceedings}
}
He, Zihao; Tavabi, Leili; Lerman, Kristina; Soleymani, Mohammad
Speaker Turn Modeling for Dialogue Act Classification Proceedings Article
In: Findings of the Association for Computational Linguistics: EMNLP 2021, pp. 2150–2157, Association for Computational Linguistics, Punta Cana, Dominican Republic, 2021.
Links | BibTeX | Tags: Dialogue, DTIC, UARC
@inproceedings{he_speaker_2021,
title = {Speaker Turn Modeling for Dialogue Act Classification},
author = {Zihao He and Leili Tavabi and Kristina Lerman and Mohammad Soleymani},
url = {https://aclanthology.org/2021.findings-emnlp.185},
doi = {10.18653/v1/2021.findings-emnlp.185},
year = {2021},
date = {2021-01-01},
urldate = {2022-09-23},
booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2021},
pages = {2150–2157},
publisher = {Association for Computational Linguistics},
address = {Punta Cana, Dominican Republic},
keywords = {Dialogue, DTIC, UARC},
pubstate = {published},
tppubtype = {inproceedings}
}
Cheng, Junyan; Fostiropoulos, Iordanis; Boehm, Barry; Soleymani, Mohammad
Multimodal Phased Transformer for Sentiment Analysis Proceedings Article
In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pp. 2447–2458, Association for Computational Linguistics, Online and Punta Cana, Dominican Republic, 2021.
Links | BibTeX | Tags: DTIC, UARC
@inproceedings{cheng_multimodal_2021,
title = {Multimodal Phased Transformer for Sentiment Analysis},
author = {Junyan Cheng and Iordanis Fostiropoulos and Barry Boehm and Mohammad Soleymani},
url = {https://aclanthology.org/2021.emnlp-main.189},
doi = {10.18653/v1/2021.emnlp-main.189},
year = {2021},
date = {2021-01-01},
urldate = {2022-09-23},
booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
pages = {2447–2458},
publisher = {Association for Computational Linguistics},
address = {Online and Punta Cana, Dominican Republic},
keywords = {DTIC, UARC},
pubstate = {published},
tppubtype = {inproceedings}
}
2020
Rayatdoost, Soheil; Rudrauf, David; Soleymani, Mohammad
Expression-Guided EEG Representation Learning for Emotion Recognition Proceedings Article
In: ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 3222–3226, IEEE, Barcelona, Spain, 2020, ISBN: 978-1-5090-6631-5.
Abstract | Links | BibTeX | Tags: UARC, Virtual Humans
@inproceedings{rayatdoost_expression-guided_2020,
title = {Expression-Guided EEG Representation Learning for Emotion Recognition},
author = {Soheil Rayatdoost and David Rudrauf and Mohammad Soleymani},
url = {https://ieeexplore.ieee.org/document/9053004/},
doi = {10.1109/ICASSP40776.2020.9053004},
isbn = {978-1-5090-6631-5},
year = {2020},
date = {2020-05-01},
booktitle = {ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
pages = {3222–3226},
publisher = {IEEE},
address = {Barcelona, Spain},
abstract = {Learning a joint and coordinated representation between different modalities can improve multimodal emotion recognition. In this paper, we propose a deep representation learning approach for emotion recognition from electroencephalogram (EEG) signals guided by facial electromyogram (EMG) and electrooculogram (EOG) signals. We recorded EEG, EMG and EOG signals from 60 participants who watched 40 short videos and self-reported their emotions. A cross-modal encoder that jointly learns the features extracted from facial and ocular expressions and EEG responses was designed and evaluated on our recorded data and MAHNOB-HCI, a publicly available database. We demonstrate that the proposed representation is able to improve emotion recognition performance. We also show that the learned representation can be transferred to a different database without EMG and EOG and achieve superior performance. Methods that fuse behavioral and neural responses can be deployed in wearable emotion recognition solutions, practical in situations in which computer vision expression recognition is not feasible.},
keywords = {UARC, Virtual Humans},
pubstate = {published},
tppubtype = {inproceedings}
}
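One plausible reading of the expression-guided training described in the abstract is sketched below: an EEG encoder is optimized for emotion classification while its embedding is pulled toward an embedding of the EMG/EOG signals, so only EEG is needed at test time. The architecture, loss weighting, and the `exp_encoder` module are assumptions for illustration, not the authors' implementation.

```python
# One plausible sketch of expression-guided EEG representation learning:
# the EEG embedding is trained for emotion classification while being pulled
# toward an embedding of the EMG/EOG signals. `exp_encoder` is an assumed,
# separately defined encoder; this is not the authors' implementation.
import torch
import torch.nn as nn
import torch.nn.functional as F

class EEGEncoder(nn.Module):
    def __init__(self, in_channels=32, emb_dim=128, num_classes=4):
        super().__init__()
        self.backbone = nn.Sequential(
            nn.Conv1d(in_channels, 64, kernel_size=7, padding=3), nn.ReLU(),
            nn.AdaptiveAvgPool1d(1), nn.Flatten(), nn.Linear(64, emb_dim))
        self.classifier = nn.Linear(emb_dim, num_classes)

    def forward(self, eeg):                       # eeg: (batch, channels, time)
        z = self.backbone(eeg)
        return z, self.classifier(z)

def guided_loss(eeg_encoder, exp_encoder, eeg, emg_eog, labels, alpha=0.5):
    z_eeg, logits = eeg_encoder(eeg)
    with torch.no_grad():
        z_exp = exp_encoder(emg_eog)              # embedding of facial/ocular signals
    guide = 1.0 - F.cosine_similarity(z_eeg, z_exp).mean()   # pull EEG toward expression space
    return F.cross_entropy(logits, labels) + alpha * guide   # emotion loss + guidance term
```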
Zhao, Sicheng; Wang, Shangfei; Soleymani, Mohammad; Joshi, Dhiraj; Ji, Qiang
Affective Computing for Large-scale Heterogeneous Multimedia Data: A Survey Journal Article
In: ACM Transactions on Multimedia Computing, Communications, and Applications, vol. 15, no. 3s, pp. 1–32, 2020, ISSN: 1551-6857, 1551-6865.
Abstract | Links | BibTeX | Tags: UARC, Virtual Humans
@article{zhao_affective_2020,
title = {Affective Computing for Large-scale Heterogeneous Multimedia Data: A Survey},
author = {Sicheng Zhao and Shangfei Wang and Mohammad Soleymani and Dhiraj Joshi and Qiang Ji},
url = {https://dl.acm.org/doi/10.1145/3363560},
doi = {10.1145/3363560},
issn = {1551-6857, 1551-6865},
year = {2020},
date = {2020-01-01},
journal = {ACM Transactions on Multimedia Computing, Communications, and Applications},
volume = {15},
number = {3s},
pages = {1–32},
abstract = {The wide popularity of digital photography and social networks has generated a rapidly growing volume of multimedia data (i.e., images, music, and videos), resulting in a great demand for managing, retrieving, and understanding these data. Affective computing (AC) of these data can help to understand human behaviors and enable wide applications. In this article, we survey the state-of-the-art AC technologies comprehensively for large-scale heterogeneous multimedia data. We begin this survey by introducing the typical emotion representation models from psychology that are widely employed in AC. We briefly describe the available datasets for evaluating AC algorithms. We then summarize and compare the representative methods on AC of different multimedia types, i.e., images, music, videos, and multimodal data, with the focus on both handcrafted features-based methods and deep learning methods. Finally, we discuss some challenges and future directions for multimedia affective computing.},
keywords = {UARC, Virtual Humans},
pubstate = {published},
tppubtype = {article}
}
2019
Tavabi, Leili; Stefanov, Kalin; Gilani, Setareh Nasihati; Traum, David; Soleymani, Mohammad
Multimodal Learning for Identifying Opportunities for Empathetic Responses Proceedings Article
In: Proceedings of the 2019 International Conference on Multimodal Interaction, pp. 95–104, ACM, Suzhou, China, 2019, ISBN: 978-1-4503-6860-5.
Abstract | Links | BibTeX | Tags: UARC, Virtual Humans
@inproceedings{tavabi_multimodal_2019,
title = {Multimodal Learning for Identifying Opportunities for Empathetic Responses},
author = {Leili Tavabi and Kalin Stefanov and Setareh Nasihati Gilani and David Traum and Mohammad Soleymani},
url = {https://dl.acm.org/doi/10.1145/3340555.3353750},
doi = {10.1145/3340555.3353750},
isbn = {978-1-4503-6860-5},
year = {2019},
date = {2019-10-01},
booktitle = {Proceedings of the 2019 International Conference on Multimodal Interaction},
pages = {95–104},
publisher = {ACM},
address = {Suzhou, China},
abstract = {Embodied interactive agents possessing emotional intelligence and empathy can create natural and engaging social interactions. Providing appropriate responses by interactive virtual agents requires the ability to perceive users’ emotional states. In this paper, we study and analyze behavioral cues that indicate an opportunity to provide an empathetic response. Emotional tone in language in addition to facial expressions are strong indicators of dramatic sentiment in conversation that warrant an empathetic response. To automatically recognize such instances, we develop a multimodal deep neural network for identifying opportunities when the agent should express positive or negative empathetic responses. We train and evaluate our model using audio, video and language from human-agent interactions in a Wizard-of-Oz setting, using the wizard’s empathetic responses and annotations collected on Amazon Mechanical Turk as ground-truth labels. Our model outperforms a text-based baseline, achieving an F1-score of 0.71 on a three-class classification. We further investigate the results and evaluate the capability of such a model to be deployed for real-world human-agent interactions.},
keywords = {UARC, Virtual Humans},
pubstate = {published},
tppubtype = {inproceedings}
}
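For readers unfamiliar with the setup, a generic late-fusion classifier over audio, video, and language features for a three-class decision (no response vs. positive vs. negative empathetic response) might look like the sketch below; the feature dimensions and fusion strategy are placeholders and do not reproduce the paper's model.

```python
# Generic late-fusion sketch for the three-class decision (no response vs.
# positive vs. negative empathetic response) from audio, video, and language
# features. Feature dimensions are placeholders; this does not reproduce the
# paper's architecture.
import torch
import torch.nn as nn

class LateFusionClassifier(nn.Module):
    def __init__(self, audio_dim=88, video_dim=35, text_dim=768, hidden=128, num_classes=3):
        super().__init__()
        self.audio = nn.Sequential(nn.Linear(audio_dim, hidden), nn.ReLU())
        self.video = nn.Sequential(nn.Linear(video_dim, hidden), nn.ReLU())
        self.text = nn.Sequential(nn.Linear(text_dim, hidden), nn.ReLU())
        self.head = nn.Linear(3 * hidden, num_classes)

    def forward(self, audio_feat, video_feat, text_feat):
        fused = torch.cat([self.audio(audio_feat),
                           self.video(video_feat),
                           self.text(text_feat)], dim=-1)
        return self.head(fused)

logits = LateFusionClassifier()(torch.randn(4, 88), torch.randn(4, 35), torch.randn(4, 768))
```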
Soleymani, Mohammad; Stefanov, Kalin; Kang, Sin-Hwa; Ondras, Jan; Gratch, Jonathan
Multimodal Analysis and Estimation of Intimate Self-Disclosure Proceedings Article
In: Proceedings of the 2019 International Conference on Multimodal Interaction (ICMI '19), pp. 59–68, ACM Press, Suzhou, China, 2019, ISBN: 978-1-4503-6860-5.
Abstract | Links | BibTeX | Tags: MxR, UARC, Virtual Humans
@inproceedings{soleymani_multimodal_2019,
title = {Multimodal Analysis and Estimation of Intimate Self-Disclosure},
author = {Mohammad Soleymani and Kalin Stefanov and Sin-Hwa Kang and Jan Ondras and Jonathan Gratch},
url = {http://dl.acm.org/citation.cfm?doid=3340555.3353737},
doi = {10.1145/3340555.3353737},
isbn = {978-1-4503-6860-5},
year = {2019},
date = {2019-10-01},
booktitle = {Proceedings of the 2019 International Conference on Multimodal Interaction (ICMI '19)},
pages = {59–68},
publisher = {ACM Press},
address = {Suzhou, China},
abstract = {Self-disclosure to others has a proven benefit for one’s mental health. It is shown that disclosure to computers can be similarly beneficial for emotional and psychological well-being. In this paper, we analyzed verbal and nonverbal behavior associated with self-disclosure in two datasets containing structured human-human and human-agent interviews from more than 200 participants. Correlation analysis of verbal and nonverbal behavior revealed that linguistic features such as affective and cognitive content in verbal behavior, and nonverbal behavior such as head gestures are associated with intimate self-disclosure. A multimodal deep neural network was developed to automatically estimate the level of intimate self-disclosure from verbal and nonverbal behavior. Between modalities, verbal behavior was the best modality for estimating self-disclosure within-corpora achieving r = 0.66. However, the cross-corpus evaluation demonstrated that nonverbal behavior can outperform language modality in cross-corpus evaluation. Such automatic models can be deployed in interactive virtual agents or social robots to evaluate rapport and guide their conversational strategy.},
keywords = {MxR, UARC, Virtual Humans},
pubstate = {published},
tppubtype = {inproceedings}
}
Ringeval, Fabien; Messner, Eva-Maria; Song, Siyang; Liu, Shuo; Zhao, Ziping; Mallol-Ragolta, Adria; Ren, Zhao; Soleymani, Mohammad; Pantic, Maja; Schuller, Björn; Valstar, Michel; Cummins, Nicholas; Cowie, Roddy; Tavabi, Leili; Schmitt, Maximilian; Alisamir, Sina; Amiriparian, Shahin
AVEC 2019 Workshop and Challenge: State-of-Mind, Detecting Depression with AI, and Cross-Cultural Affect Recognition Proceedings Article
In: Proceedings of the 9th International Audio/Visual Emotion Challenge and Workshop (AVEC '19), pp. 3–12, ACM Press, Nice, France, 2019, ISBN: 978-1-4503-6913-8.
Abstract | Links | BibTeX | Tags: UARC, Virtual Humans
@inproceedings{ringeval_avec_2019,
title = {AVEC 2019 Workshop and Challenge: State-of-Mind, Detecting Depression with AI, and Cross-Cultural Affect Recognition},
author = {Fabien Ringeval and Eva-Maria Messner and Siyang Song and Shuo Liu and Ziping Zhao and Adria Mallol-Ragolta and Zhao Ren and Mohammad Soleymani and Maja Pantic and Björn Schuller and Michel Valstar and Nicholas Cummins and Roddy Cowie and Leili Tavabi and Maximilian Schmitt and Sina Alisamir and Shahin Amiriparian},
url = {http://dl.acm.org/citation.cfm?doid=3347320.3357688},
doi = {10.1145/3347320.3357688},
isbn = {978-1-4503-6913-8},
year = {2019},
date = {2019-10-01},
booktitle = {Proceedings of the 9th International Audio/Visual Emotion Challenge and Workshop (AVEC '19)},
pages = {3–12},
publisher = {ACM Press},
address = {Nice, France},
abstract = {The Audio/Visual Emotion Challenge and Workshop (AVEC 2019) 'State-of-Mind, Detecting Depression with AI, and Cross-cultural Affect Recognition' is the ninth competition event aimed at the comparison of multimedia processing and machine learning methods for automatic audiovisual health and emotion analysis, with all participants competing strictly under the same conditions. The goal of the Challenge is to provide a common benchmark test set for multimodal information processing and to bring together the health and emotion recognition communities, as well as the audiovisual processing communities, to compare the relative merits of various approaches to health and emotion recognition from real-life data. This paper presents the major novelties introduced this year, the challenge guidelines, the data used, and the performance of the baseline systems on the three proposed tasks: state-of-mind recognition, depression assessment with AI, and cross-cultural affect sensing, respectively.},
keywords = {UARC, Virtual Humans},
pubstate = {published},
tppubtype = {inproceedings}
}
Song, Yale; Soleymani, Mohammad
Polysemous Visual-Semantic Embedding for Cross-Modal Retrieval Proceedings Article
In: Proceedings of the 2019 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10, IEEE, Long Beach, CA, 2019.
Abstract | Links | BibTeX | Tags: UARC, Virtual Humans
@inproceedings{song_polysemous_2019,
title = {Polysemous Visual-Semantic Embedding for Cross-Modal Retrieval},
author = {Yale Song and Mohammad Soleymani},
url = {https://arxiv.org/abs/1906.04402},
year = {2019},
date = {2019-06-01},
booktitle = {Proceedings of the 2019 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
pages = {10},
publisher = {IEEE},
address = {Long Beach, CA},
abstract = {Visual-semantic embedding aims to find a shared latent space where related visual and textual instances are close to each other. Most current methods learn injective embedding functions that map an instance to a single point in the shared space. Unfortunately, injective embedding cannot effectively handle polysemous instances with multiple possible meanings; at best, it would find an average representation of different meanings. This hinders its use in real-world scenarios where individual instances and their cross-modal associations are often ambiguous. In this work, we introduce Polysemous Instance Embedding Networks (PIE-Nets) that compute multiple and diverse representations of an instance by combining global context with locally-guided features via multi-head self-attention and residual learning. To learn visual-semantic embedding, we tie-up two PIE-Nets and optimize them jointly in the multiple instance learning framework. Most existing work on cross-modal retrieval focus on image-text pairs of data. Here, we also tackle a more challenging case of video-text retrieval. To facilitate further research in video-text retrieval, we release a new dataset of 50K video-sentence pairs collected from social media, dubbed MRW (my reaction when). We demonstrate our approach on both image-text and video-text retrieval scenarios using MS-COCO, TGIF, and our new MRW dataset.},
keywords = {UARC, Virtual Humans},
pubstate = {published},
tppubtype = {inproceedings}
}
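The PIE-Net idea of producing several diverse embeddings per instance can be approximated with a small module in which K learnable queries attend over local features and are fused with the global feature through a gated residual. This is a simplified sketch under assumed shapes, not the authors' released model.

```python
# Simplified PIE-Net-style module: K learnable queries attend over local
# features and are fused with the global feature through a gated residual,
# yielding K diverse embeddings per instance. Shapes are assumed for
# illustration; this is not the authors' released model.
import torch
import torch.nn as nn
import torch.nn.functional as F

class PIEModule(nn.Module):
    def __init__(self, dim=512, num_embeddings=4, num_heads=8):
        super().__init__()
        self.queries = nn.Parameter(torch.randn(num_embeddings, dim))
        self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.gate = nn.Linear(dim, dim)

    def forward(self, global_feat, local_feats):
        # global_feat: (B, dim); local_feats: (B, N, dim), e.g. image regions or video segments
        q = self.queries.unsqueeze(0).expand(local_feats.size(0), -1, -1)
        attended, _ = self.attn(q, local_feats, local_feats)          # (B, K, dim)
        out = global_feat.unsqueeze(1) + torch.sigmoid(self.gate(attended)) * attended
        return F.normalize(out, dim=-1)                               # K unit-norm embeddings

# Two such modules (one per modality) can be trained jointly; a MIL-style loss
# then scores the best-matching pair among the K x K embedding combinations.
emb = PIEModule()(torch.randn(2, 512), torch.randn(2, 36, 512))       # -> (2, 4, 512)
```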
2018
Rayatdoost, Soheil; Soleymani, Mohammad
Cross-Corpus EEG-Based Emotion Recognition Proceedings Article
In: 2018 IEEE 28th International Workshop on Machine Learning for Signal Processing (MLSP), pp. 1–6, IEEE, Aalborg, Denmark, 2018, ISBN: 978-1-5386-5477-4.
Abstract | Links | BibTeX | Tags: Virtual Humans
@inproceedings{rayatdoost_cross-corpus_2018,
title = {Cross-Corpus EEG-Based Emotion Recognition},
author = {Soheil Rayatdoost and Mohammad Soleymani},
url = {https://ieeexplore.ieee.org/document/8517037/},
doi = {10.1109/MLSP.2018.8517037},
isbn = {978-1-5386-5477-4},
year = {2018},
date = {2018-09-01},
booktitle = {2018 IEEE 28th International Workshop on Machine Learning for Signal Processing (MLSP)},
pages = {1–6},
publisher = {IEEE},
address = {Aalborg, Denmark},
abstract = {Lack of generalization is a common problem in automatic emotion recognition. The present study aims to explore the suitability of the existing EEG features for emotion recognition and investigate the performance of emotion recognition methods across different corpora. We introduce a novel dataset which includes spontaneous emotions and was analyzed in addition to the existing datasets for cross-corpus evaluation. We demonstrate that the performance of the existing methods significantly decreases when evaluated across different corpora. The best results are obtained by a convolutional neural network fed by spectral topography maps from different bands. We provide some evidence that stimuli-related sensory information is learned by machine learning models for emotion recognition using EEG signals.},
keywords = {Virtual Humans},
pubstate = {published},
tppubtype = {inproceedings}
}
Aljanaki, Anna; Soleymani, Mohammad
A data-driven approach to mid-level perceptual musical feature modeling Proceedings Article
In: Proceedings of the 19th International Society for Music Information Retrieval Conference, arXiv, Paris, France, 2018.
Abstract | Links | BibTeX | Tags: Virtual Humans
@inproceedings{aljanaki_data-driven_2018,
title = {A data-driven approach to mid-level perceptual musical feature modeling},
author = {Anna Aljanaki and Mohammad Soleymani},
url = {https://arxiv.org/abs/1806.04903},
year = {2018},
date = {2018-09-01},
booktitle = {Proceedings of the 19th International Society for Music Information Retrieval Conference},
publisher = {arXiv},
address = {Paris, France},
abstract = {Musical features and descriptors could be coarsely divided into three levels of complexity. The bottom level contains the basic building blocks of music, e.g., chords, beats and timbre. The middle level contains concepts that emerge from combining the basic blocks: tonal and rhythmic stability, harmonic and rhythmic complexity, etc. High-level descriptors (genre, mood, expressive style) are usually modeled using the lower-level ones. The features belonging to the middle level can both improve automatic recognition of high-level descriptors, and provide new music retrieval possibilities. Mid-level features are subjective and usually lack clear definitions. However, they are very important for human perception of music, and on some of them people can reach high agreement, even though defining them, and therefore designing a hand-crafted feature extractor for them, can be difficult. In this paper, we derive the mid-level descriptors from data. We collect and release a dataset (https://osf.io/5aupt/) of 5000 songs annotated by musicians with seven mid-level descriptors, namely, melodiousness, tonal and rhythmic stability, modality, rhythmic complexity, dissonance and articulation. We then compare several approaches to predicting these descriptors from spectrograms using deep learning. We also demonstrate the usefulness of these mid-level features using music emotion recognition as an application.},
keywords = {Virtual Humans},
pubstate = {published},
tppubtype = {inproceedings}
}