We are pleased to inform you about the acceptance of a new paper at IEEE’s 2026 8th International Conference on Natural Language Processing (ICNLP) entitled:
Learning to Detect Cross-Modal Negation: An Analysis of Latent Representations and an Attention-Based Solution
Learning to Detect Cross-Modal Negation: An Analysis of Latent Representations and an Attention-Based Solution. In: 2026 8th International Conference on Natural Language Processing (ICNLP), 2026. Accepted.
BibTeX
@inproceedings{Abusaleh:et:al:2026,
  title     = {Learning to Detect Cross-Modal Negation: An Analysis of Latent
               Representations and an Attention-Based Solution},
  author    = {Abusaleh, Ali and Hammerla, Leon and Mehler, Alexander},
  booktitle = {2026 8th International Conference on Natural Language Processing (ICNLP)},
  publisher = {IEEE},
  eventdate = {2026-03-20/2026-03-22},
  venue     = {Xi'an, China},
  year      = {2026},
  keywords  = {Vision language model, Natural language processing, Cross-modal retrieval, negation detection, video analysis, Multimodal analysis, Political Communication, neglab, new-data-spaces, circlet},
  abstract  = {Detecting high-level semantic concepts like negation across modalities
               remains a challenge for current multimodal systems. We analyze
               this as a fundamental representation learning problem, providing
               the first evidence that negation does not form a linearly or non-linearly
               separable class in the latent spaces of standard vision-language
               models (VLMs). We demonstrate that pretrained embeddings primarily
               encode modality-specific features, lacking a generalizable negation
               signal. To overcome this, we propose a novel cross-modal attention
               architecture that explicitly models inter-modal dependencies,
               achieving performance gains of up to +7.03\% F1 over unimodal baselines.
               Our analysis reveals a key asymmetry: while textual negation often
               appears independently, visual negation is semantically dependent
               on linguistic context, a finding validated through our statistical
               analysis of 3,222 political video-text pairs automatically annotated
               via Qwen2.5-VL. By combining this analysis with self-supervised
               video representations (JEPA2), we advance the modeling of temporal
               negation. This work provides new methods and insights for learning
               robust, semantically-aligned representations in multimodal systems.},
  note      = {accepted},
}
