Publications
Tavabi, Leili; Stefanov, Kalin; Gilani, Setareh Nasihati; Traum, David; Soleymani, Mohammad
Multimodal Learning for Identifying Opportunities for Empathetic Responses Proceedings Article
In: Proceedings of the 2019 International Conference on Multimodal Interaction, pp. 95–104, ACM, Suzhou, China, 2019, ISBN: 978-1-4503-6860-5.
@inproceedings{tavabi_multimodal_2019,
title = {Multimodal Learning for Identifying Opportunities for Empathetic Responses},
author = {Leili Tavabi and Kalin Stefanov and Setareh Nasihati Gilani and David Traum and Mohammad Soleymani},
url = {https://dl.acm.org/doi/10.1145/3340555.3353750},
doi = {10.1145/3340555.3353750},
isbn = {978-1-4503-6860-5},
year = {2019},
date = {2019-10-01},
booktitle = {Proceedings of the 2019 International Conference on Multimodal Interaction},
pages = {95–104},
publisher = {ACM},
address = {Suzhou, China},
abstract = {Embodied interactive agents possessing emotional intelligence and empathy can create natural and engaging social interactions. Providing appropriate responses by interactive virtual agents requires the ability to perceive users’ emotional states. In this paper, we study and analyze behavioral cues that indicate an opportunity to provide an empathetic response. Emotional tone in language, in addition to facial expressions, is a strong indicator of dramatic sentiment in conversation that warrants an empathetic response. To automatically recognize such instances, we develop a multimodal deep neural network for identifying opportunities when the agent should express positive or negative empathetic responses. We train and evaluate our model using audio, video and language from human-agent interactions in a Wizard-of-Oz setting, using the wizard’s empathetic responses and annotations collected on Amazon Mechanical Turk as ground-truth labels. Our model outperforms a text-based baseline, achieving an F1-score of 0.71 on a three-class classification. We further investigate the results and evaluate the capability of such a model to be deployed for real-world human-agent interactions.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
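To make the fusion approach described in the abstract above concrete, here is a minimal PyTorch sketch of a late-fusion multimodal classifier over audio, video and language features, with a three-class output. All feature dimensions, layer sizes and names are illustrative assumptions, not the paper's configuration.

# Minimal sketch of a late-fusion multimodal classifier: one small encoder
# per modality, concatenation, and a three-class output (e.g., negative /
# none / positive empathetic-response opportunity). Sizes are assumptions.
import torch
import torch.nn as nn

class MultimodalClassifier(nn.Module):
    def __init__(self, audio_dim=40, video_dim=35, text_dim=300,
                 hidden=64, n_classes=3):
        super().__init__()
        self.audio_enc = nn.Sequential(nn.Linear(audio_dim, hidden), nn.ReLU())
        self.video_enc = nn.Sequential(nn.Linear(video_dim, hidden), nn.ReLU())
        self.text_enc = nn.Sequential(nn.Linear(text_dim, hidden), nn.ReLU())
        # Late fusion: concatenate encoded modalities, then classify.
        self.classifier = nn.Linear(3 * hidden, n_classes)

    def forward(self, audio, video, text):
        fused = torch.cat([self.audio_enc(audio),
                           self.video_enc(video),
                           self.text_enc(text)], dim=-1)
        return self.classifier(fused)

# Example forward pass on a batch of 8 utterance-level feature vectors.
model = MultimodalClassifier()
logits = model(torch.randn(8, 40), torch.randn(8, 35), torch.randn(8, 300))
print(logits.shape)  # torch.Size([8, 3])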
Soleymani, Mohammad; Stefanov, Kalin; Kang, Sin-Hwa; Ondras, Jan; Gratch, Jonathan
Multimodal Analysis and Estimation of Intimate Self-Disclosure Proceedings Article
In: Proceedings of the 2019 International Conference on Multimodal Interaction on - ICMI '19, pp. 59–68, ACM Press, Suzhou, China, 2019, ISBN: 978-1-4503-6860-5.
@inproceedings{soleymani_multimodal_2019,
title = {Multimodal Analysis and Estimation of Intimate Self-Disclosure},
author = {Mohammad Soleymani and Kalin Stefanov and Sin-Hwa Kang and Jan Ondras and Jonathan Gratch},
url = {http://dl.acm.org/citation.cfm?doid=3340555.3353737},
doi = {10.1145/3340555.3353737},
isbn = {978-1-4503-6860-5},
year = {2019},
date = {2019-10-01},
booktitle = {Proceedings of the 2019 International Conference on Multimodal Interaction on - ICMI '19},
pages = {59–68},
publisher = {ACM Press},
address = {Suzhou, China},
abstract = {Self-disclosure to others has a proven benefit for one’s mental health. It has been shown that disclosure to computers can be similarly beneficial for emotional and psychological well-being. In this paper, we analyzed verbal and nonverbal behavior associated with self-disclosure in two datasets containing structured human-human and human-agent interviews from more than 200 participants. Correlation analysis revealed that linguistic features, such as affective and cognitive content in verbal behavior, and nonverbal behavior, such as head gestures, are associated with intimate self-disclosure. A multimodal deep neural network was developed to automatically estimate the level of intimate self-disclosure from verbal and nonverbal behavior. Among modalities, verbal behavior was the best for estimating self-disclosure within-corpus, achieving r = 0.66. However, cross-corpus evaluation demonstrated that nonverbal behavior can outperform the language modality. Such automatic models can be deployed in interactive virtual agents or social robots to evaluate rapport and guide their conversational strategy.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
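The cross-corpus protocol this abstract reports (train on one interview corpus, test on the other, score with Pearson's r) can be sketched in a few lines. A ridge regressor stands in for the paper's deep network, and the feature arrays and corpus names below are placeholders, not the actual data.

# Sketch of cross-corpus evaluation for a self-disclosure regressor:
# fit on one corpus, predict on the other, report Pearson's r.
import numpy as np
from scipy.stats import pearsonr
from sklearn.linear_model import Ridge

rng = np.random.default_rng(0)
# Placeholder behavior features and intimacy labels for two corpora.
corpora = {
    "human-human": (rng.normal(size=(120, 50)), rng.uniform(1, 5, 120)),
    "human-agent": (rng.normal(size=(100, 50)), rng.uniform(1, 5, 100)),
}

for train_name, (X_tr, y_tr) in corpora.items():
    for test_name, (X_te, y_te) in corpora.items():
        if train_name == test_name:
            continue  # within-corpus results would use cross-validation instead
        model = Ridge(alpha=1.0).fit(X_tr, y_tr)
        r, _ = pearsonr(model.predict(X_te), y_te)
        print(f"train={train_name} test={test_name} r={r:.2f}")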
Ringeval, Fabien; Messner, Eva-Maria; Song, Siyang; Liu, Shuo; Zhao, Ziping; Mallol-Ragolta, Adria; Ren, Zhao; Soleymani, Mohammad; Pantic, Maja; Schuller, Björn; Valstar, Michel; Cummins, Nicholas; Cowie, Roddy; Tavabi, Leili; Schmitt, Maximilian; Alisamir, Sina; Amiriparian, Shahin
AVEC 2019 Workshop and Challenge: State-of-Mind, Detecting Depression with AI, and Cross-Cultural Affect Recognition Proceedings Article
In: Proceedings of the 9th International on Audio/Visual Emotion Challenge and Workshop - AVEC '19, pp. 3–12, ACM Press, Nice, France, 2019, ISBN: 978-1-4503-6913-8.
@inproceedings{ringeval_avec_2019,
title = {AVEC 2019 Workshop and Challenge: State-of-Mind, Detecting Depression with AI, and Cross-Cultural Affect Recognition},
author = {Fabien Ringeval and Eva-Maria Messner and Siyang Song and Shuo Liu and Ziping Zhao and Adria Mallol-Ragolta and Zhao Ren and Mohammad Soleymani and Maja Pantic and Björn Schuller and Michel Valstar and Nicholas Cummins and Roddy Cowie and Leili Tavabi and Maximilian Schmitt and Sina Alisamir and Shahin Amiriparian},
url = {http://dl.acm.org/citation.cfm?doid=3347320.3357688},
doi = {10.1145/3347320.3357688},
isbn = {978-1-4503-6913-8},
year = {2019},
date = {2019-10-01},
booktitle = {Proceedings of the 9th International on Audio/Visual Emotion Challenge and Workshop - AVEC '19},
pages = {3–12},
publisher = {ACM Press},
address = {Nice, France},
abstract = {The Audio/Visual Emotion Challenge and Workshop (AVEC 2019) 'State-of-Mind, Detecting Depression with AI, and Cross-cultural Affect Recognition' is the ninth competition event aimed at the comparison of multimedia processing and machine learning methods for automatic audiovisual health and emotion analysis, with all participants competing strictly under the same conditions. The goal of the Challenge is to provide a common benchmark test set for multimodal information processing and to bring together the health and emotion recognition communities, as well as the audiovisual processing communities, to compare the relative merits of various approaches to health and emotion recognition from real-life data. This paper presents the major novelties introduced this year, the challenge guidelines, the data used, and the performance of the baseline systems on the three proposed tasks: state-of-mind recognition, depression assessment with AI, and cross-cultural affect sensing, respectively.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Song, Yale; Soleymani, Mohammad
Polysemous Visual-Semantic Embedding for Cross-Modal Retrieval Proceedings Article
In: Proceedings of the 2019 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10, IEEE, Long Beach, CA, 2019.
@inproceedings{song_polysemous_2019,
title = {Polysemous Visual-Semantic Embedding for Cross-Modal Retrieval},
author = {Yale Song and Mohammad Soleymani},
url = {https://arxiv.org/abs/1906.04402},
year = {2019},
date = {2019-06-01},
booktitle = {Proceedings of the 2019 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
pages = {10},
publisher = {IEEE},
address = {Long Beach, CA},
abstract = {Visual-semantic embedding aims to find a shared latent space where related visual and textual instances are close to each other. Most current methods learn injective embedding functions that map an instance to a single point in the shared space. Unfortunately, injective embedding cannot effectively handle polysemous instances with multiple possible meanings; at best, it would find an average representation of different meanings. This hinders its use in real-world scenarios where individual instances and their cross-modal associations are often ambiguous. In this work, we introduce Polysemous Instance Embedding Networks (PIE-Nets) that compute multiple and diverse representations of an instance by combining global context with locally-guided features via multi-head self-attention and residual learning. To learn visual-semantic embedding, we tie up two PIE-Nets and optimize them jointly in the multiple instance learning framework. Most existing work on cross-modal retrieval focuses on image-text pairs of data. Here, we also tackle a more challenging case of video-text retrieval. To facilitate further research in video-text retrieval, we release a new dataset of 50K video-sentence pairs collected from social media, dubbed MRW (my reaction when). We demonstrate our approach on both image-text and video-text retrieval scenarios using MS-COCO, TGIF, and our new MRW dataset.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
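The core idea in this abstract, producing several diverse embeddings of one instance by attending over local features and adding the global context back through a residual connection, can be illustrated in a short sketch. The class below is an assumption-laden toy, not the released PIE-Net; all names and sizes are invented for illustration.

# Illustrative polysemous embedding: K attention heads over local features
# yield K summaries, each combined with the global feature via a residual.
import torch
import torch.nn as nn
import torch.nn.functional as F

class PolysemousEmbedding(nn.Module):
    def __init__(self, local_dim=512, embed_dim=256, n_heads=4):
        super().__init__()
        self.attn = nn.Linear(local_dim, n_heads)  # one score per head
        self.proj = nn.Linear(local_dim, embed_dim)

    def forward(self, local_feats, global_feat):
        # local_feats: (batch, n_locals, local_dim); global_feat: (batch, local_dim)
        weights = F.softmax(self.attn(local_feats), dim=1)  # (B, N, K)
        # K attention-pooled summaries of the local features.
        pooled = torch.einsum("bnk,bnd->bkd", weights, local_feats)
        # Residual: each head's embedding = global context + local summary.
        emb = self.proj(global_feat).unsqueeze(1) + self.proj(pooled)
        return F.normalize(emb, dim=-1)  # (B, K, embed_dim)

enc = PolysemousEmbedding()
out = enc(torch.randn(2, 36, 512), torch.randn(2, 512))
print(out.shape)  # torch.Size([2, 4, 256])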
Rayatdoost, Soheil; Soleymani, Mohammad
Cross-Corpus EEG-Based Emotion Recognition Proceedings Article
In: 2018 IEEE 28th International Workshop on Machine Learning for Signal Processing (MLSP), pp. 1–6, IEEE, Aalborg, Denmark, 2018, ISBN: 978-1-5386-5477-4.
@inproceedings{rayatdoost_cross-corpus_2018,
title = {Cross-Corpus EEG-Based Emotion Recognition},
author = {Soheil Rayatdoost and Mohammad Soleymani},
url = {https://ieeexplore.ieee.org/document/8517037/},
doi = {10.1109/MLSP.2018.8517037},
isbn = {978-1-5386-5477-4},
year = {2018},
date = {2018-09-01},
booktitle = {2018 IEEE 28th International Workshop on Machine Learning for Signal Processing (MLSP)},
pages = {1–6},
publisher = {IEEE},
address = {Aalborg, Denmark},
abstract = {Lack of generalization is a common problem in automatic emotion recognition. The present study aims to explore the suitability of the existing EEG features for emotion recognition and investigate the performance of emotion recognition methods across different corpora. We introduce a novel dataset which includes spontaneous emotions and was analyzed in addition to the existing datasets for cross-corpus evaluation. We demonstrate that the performance of the existing methods significantly decreases when evaluated across different corpora. The best results are obtained by a convolutional neural network fed by spectral topography maps from different bands. We provide some evidence that stimuli-related sensory information is learned by machine learning models for emotion recognition using EEG signals.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
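The best-performing setup this abstract mentions, a convolutional network fed by spectral topography maps from different bands, can be sketched as a CNN whose input channels are per-band topographic power maps. Map resolution, band count and layer sizes below are assumptions for illustration.

# Minimal CNN over EEG spectral topography maps: one input channel per
# frequency band (e.g., delta/theta/alpha/beta/gamma), binary output.
import torch
import torch.nn as nn

n_bands = 5  # one topographic power map per EEG band
model = nn.Sequential(
    nn.Conv2d(n_bands, 16, kernel_size=3, padding=1), nn.ReLU(),
    nn.MaxPool2d(2),
    nn.Conv2d(16, 32, kernel_size=3, padding=1), nn.ReLU(),
    nn.AdaptiveAvgPool2d(1),
    nn.Flatten(),
    nn.Linear(32, 2),  # e.g., low/high valence
)
maps = torch.randn(4, n_bands, 32, 32)  # batch of band-wise topo maps
print(model(maps).shape)  # torch.Size([4, 2])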
Aljanaki, Anna; Soleymani, Mohammad
A data-driven approach to mid-level perceptual musical feature modeling Proceedings Article
In: Proceedings of the 19th International Society for Music Information Retrieval Conference, ISMIR, Paris, France, 2018.
@inproceedings{aljanaki_data-driven_2018,
title = {A data-driven approach to mid-level perceptual musical feature modeling},
author = {Anna Aljanaki and Mohammad Soleymani},
url = {https://arxiv.org/abs/1806.04903},
year = {2018},
date = {2018-09-01},
booktitle = {Proceedings of the 19th International Society for Music Information Retrieval Conference},
publisher = {ISMIR},
address = {Paris, France},
abstract = {Musical features and descriptors could be coarsely divided into three levels of complexity. The bottom level contains the basic building blocks of music, e.g., chords, beats and timbre. The middle level contains concepts that emerge from combining the basic blocks: tonal and rhythmic stability, harmonic and rhythmic complexity, etc. High-level descriptors (genre, mood, expressive style) are usually modeled using the lower level ones. The features belonging to the middle level can both improve automatic recognition of high-level descriptors, and provide new music retrieval possibilities. Mid-level features are subjective and usually lack clear definitions. However, they are very important for human perception of music, and on some of them people can reach high agreement, even though defining them, and therefore designing a hand-crafted feature extractor for them, can be difficult. In this paper, we derive the mid-level descriptors from data. We collect and release a dataset (https://osf.io/5aupt/) of 5000 songs annotated by musicians with seven mid-level descriptors, namely melodiousness, tonal and rhythmic stability, modality, rhythmic complexity, dissonance and articulation. We then compare several approaches to predicting these descriptors from spectrograms using deep learning. We also demonstrate the usefulness of these mid-level features using music emotion recognition as an application.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
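As a concrete picture of the prediction task this abstract compares approaches for, here is a sketch of a small CNN regressing the seven mid-level descriptors (melodiousness, tonal stability, rhythmic stability, modality, rhythmic complexity, dissonance, articulation) from a spectrogram. The architecture, input size and rating scale are illustrative assumptions, not one of the paper's models.

# Sketch: spectrogram in, seven mid-level descriptor ratings out, trained
# with mean-squared error; one backward pass shown for illustration.
import torch
import torch.nn as nn

model = nn.Sequential(
    nn.Conv2d(1, 16, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
    nn.Conv2d(16, 32, 3, padding=1), nn.ReLU(), nn.AdaptiveAvgPool2d(1),
    nn.Flatten(),
    nn.Linear(32, 7),  # one regression output per mid-level descriptor
)
spec = torch.randn(8, 1, 128, 640)  # batch of mel-spectrograms (assumed size)
targets = torch.rand(8, 7) * 10     # descriptor ratings (assumed 0-10 scale)
loss = nn.MSELoss()(model(spec), targets)
loss.backward()                     # gradients for one training step
print(float(loss))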