
The New Data Spaces for the Social Sciences programme aims to drive a surge in innovation by improving, enhancing and combining existing panel data infrastructures and emerging data sources to develop new data spaces for social science research. It integrates and consolidates skills, knowledge and expertise from different fields of empirical social research and computer science and provides the means to test new methods and procedures of data generation and data analytics.
www.new-data-spaces.de
In order to more precisely research the major societal challenges of the coming decades, including digitization, climate change, and war- and pandemic-related societal changes, and to be able to identify the need for political action on this basis, the social sciences need innovative research data and methods.
ENTAILab
ENTAILab is the core infrastructural service and research centre of the New Data Spaces programme.
The Research Infrastructure and Innovation Lab (ENTAILab) is dedicated to the use of existing research infrastructures, their advancement and the demand-oriented generation of a new research infrastructure for the needs of the InfPP projects and the development of new data spaces. ENTAILab aims to create a unique infrastructure for research-based innovations in the field of survey data and beyond.
ENTAILab consists of a set of four infrastructure measures that provide a successful and supportive environment for research within and across the projects of InfPP. Together, they will systematically feed results back into different kinds of panel applications and studies and social science research in general.
CIRCLET
ENTAILab involves the implementation, testing and provision of a strong research-oriented tool in the form of a research-driven infrastructure for advanced survey-related data (CIRCLET). CIRCLET will ensure the reproducibility and interoperability of methods working with survey data. This is done through a multi-phase strategy that drives, scales and evaluates the development of methods based on new survey data over the course of InfPP. CIRCLET develops, tests and provides generic services to open up new data and methodological horizons according to the evolving needs of InfPP.
CIRCLET is preferably used by all InfPP projects to share data and methods, test their reproducibility and interoperability, and enrich their methods. Using the Docker Unified UIMA Interface (DUUI), CIRCLET provides a distributed multi-server infrastructure that allows InfPP to containerize methods and facilitate their operation in server clusters to make them reusable. This contributes to the coherence of all InfPP projects and to making innovations available in such a way that they can be reused outside the innovating project as quickly and extensively as possible. Collaboration between projects using CIRCLET as a common platform will be massively strengthened.
CIRCLET is research-driven; it focuses on the needs of the InfPP for which there is currently no or insufficient provision, and goes beyond what is offered by the NFDIs with which the InfPP collaborates in order to maximize synergies. CIRCLET includes several means to model and enhance the survey data research cycle: a multimodal data acquisition system, a machine learning system that leverages large language models and related technologies, and a hub technology for securing reproducibility.
Publications
BibTeX
@inproceedings{Abusaleh:et:al:2026,
  author    = {Abusaleh, Ali and Hammerla, Leon and Mehler, Alexander},
  title     = {Learning to Detect Cross-Modal Negation: An Analysis of Latent
               Representations and an Attention-Based Solution},
  booktitle = {2026 8th International Conference on Natural Language Processing (ICNLP)},
  eventdate = {2026-03-20/2026-03-22},
  venue     = {Xi'an, China},
  year      = {2026},
  keywords  = {Vision language model, Natural language processing, Cross-modal retrieval, negation detection, video analysis, Multimodal analysis, Political Communication, neglab, new-data-spaces, circlet},
  abstract  = {Detecting high-level semantic concepts like negation across modalities
               remains a challenge for current multimodal systems. We analyze
               this as a fundamental representation learning problem, providing
               the first evidence that negation does not form a linearly or non-linearly
               separable class in the latent spaces of standard vision-language
               models (VLMs). We demonstrate that pretrained embeddings primarily
               encode modality-specific features, lacking a generalizable negation
               signal. To overcome this, we propose a novel cross-modal attention
               architecture that explicitly models inter-modal dependencies,
               achieving performance gains of up to +7.03\% F1 over unimodal baselines.
               Our analysis reveals a key asymmetry: while textual negation often
               appears independently, visual negation is semantically dependent
               on linguistic context, a finding validated through our statistical
               analysis of 3,222 political video-text pairs automatically annotated
               via Qwen2.5-VL. By combining this analysis with self-supervised
               video representations (JEPA2), we advance the modeling of temporal
               negation. This work provides new methods and insights for learning
               robust, semantically-aligned representations in multimodal systems.},
  note      = {accepted}
}
BibTeX
@article{Borkowski:et:al:2026,
  author   = {Borkowski, Cedric and Abrami, Giuseppe and Terefe, Dawit and Baumartz, Daniel
              and Mehler, Alexander},
  title    = {{DUUIgateway}: A Web Service for Platform-independent, Ubiquitous Big Data {NLP}},
  journal  = {SoftwareX},
  volume   = {34},
  pages    = {102549},
  year     = {2026},
  issn     = {2352-7110},
  doi      = {10.1016/j.softx.2026.102549},
  url      = {https://www.sciencedirect.com/science/article/pii/S2352711026000439},
  keywords = {duui, neglab, core, core_b05, core_c08, new-data-spaces, circlet},
  abstract = {Distributed processing of unstructured text data is a challenge
              in the rapidly changing and evolving natural language processing
              (NLP) landscape. This landscape is characterized by heterogeneous
              systems, models, and formats, and especially by the increasing
              influence of AI systems. While many of these systems handle text
              data, there are also unified systems that process multiple input
              and output formats, while allowing for distributed corpus processing.
              However, there are hardly any user-friendly interfaces that allow
              existing NLP frameworks to be used flexibly and extended in a
              user-controlled manner. Due to this gap and the increasing importance
              of NLP for various scientific disciplines, there has been a demand
              for a web and API based flexible software solution for deploying,
              managing and monitoring NLP systems. Such a solution is provided
              by Docker Unified UIMA-gateway. We introduce DUUIgateway and evaluate
              its API and user-driven approach to encapsulation. We also describe
              how these features improve the usability and accessibility of
              the NLP framework DUUI. We illustrate DUUIgateway in the field
              of process modeling in higher education and show how it closes
              the latter gap in NLP by making a variety of systems for processing
              text and multimodal data accessible to non-experts.}
}
BibTeX
@inproceedings{Bundan:Abrami:Mehler:2025,
  author    = {Bundan, Daniel and Abrami, Giuseppe and Mehler, Alexander},
  title     = {Multimodal Docker Unified {UIMA} Interface: New Horizons for Distributed
               Microservice-Oriented Processing of Corpora using {UIMA}},
  booktitle = {Proceedings of the 21st Conference on Natural Language Processing
               (KONVENS 2025): Long and Short Papers},
  year      = {2025},
  editor    = {Wartena, Christian and Heid, Ulrich},
  venue     = {Hildesheim, Germany},
  address   = {Hannover, Germany},
  publisher = {HsH Applied Academics},
  pages     = {257--268},
  series    = {KONVENS '25},
  url       = {https://aclanthology.org/2025.konvens-1.22/},
  pdf       = {https://aclanthology.org/2025.konvens-1.22.pdf},
  poster    = {https://www.texttechnologylab.org/wp-content/uploads/2025/09/Poster_Multimodal_DUUI_KONVENS_2025.pdf},
  keywords  = {duui,neglab,new-data-spaces,circlet}
}
BibTeX
@inproceedings{Boenisch:et:al:2025,
  author    = {B{\"o}nisch, Kevin and Abrami, Giuseppe and Mehler, Alexander},
  title     = {Towards Unified, Dynamic and Annotation-based Visualisations and
               Exploration of Annotated Big Data Corpora with the Help of
               {Unified Corpus Explorer}},
  editor    = {Dziri, Nouha and Ren, Sean (Xiang) and Diao, Shizhe},
  booktitle = {Proceedings of the 2025 Conference of the Nations of the Americas
               Chapter of the Association for Computational Linguistics: Human
               Language Technologies (System Demonstrations)},
  year      = {2025},
  address   = {Albuquerque, New Mexico},
  publisher = {Association for Computational Linguistics},
  pages     = {522--534},
  isbn      = {979-8-89176-191-9},
  url       = {https://aclanthology.org/2025.naacl-demo.42/},
  pdf       = {https://aclanthology.org/2025.naacl-demo.42.pdf},
  abstract  = {The annotation and exploration of large text corpora, both automatic
               and manual, presents significant challenges across multiple disciplines,
               including linguistics, digital humanities, biology, and legal
               science. These challenges are exacerbated by the heterogeneity
               of processing methods, which complicates corpus visualization,
               interaction, and integration. To address these issues, we introduce
               the Unified Corpus Explorer (UCE), a standardized, dockerized,
               open-source and dynamic Natural Language Processing (NLP) application
               designed for flexible and scalable corpus navigation. Herein,
               UCE utilizes the UIMA format for NLP annotations as a standardized
               input, constructing interfaces and features around those annotations
               while dynamically adapting to the corpora and their extracted
               annotations. We evaluate UCE based on a user study and demonstrate
               its versatility as a corpus explorer based on generative AI.},
  note      = {Best Demo Award},
  keywords  = {uce,new-data-spaces,circlet,core,core_c08}
}
BibTeX
@article{Abrami:et:al:2025:a,
  author   = {Abrami, Giuseppe and Genios, Markos and Fitzermann, Filip and Baumartz, Daniel
              and Mehler, Alexander},
  title    = {{Docker} Unified {UIMA} Interface: New perspectives for {NLP} on big data},
  journal  = {SoftwareX},
  volume   = {29},
  pages    = {102033},
  year     = {2025},
  issn     = {2352-7110},
  doi      = {10.1016/j.softx.2024.102033},
  url      = {https://www.sciencedirect.com/science/article/pii/S2352711024004047},
  keywords = {Docker, Kubernetes, UIMA, Distributed NLP, duui, biofid, neglab, new-data-spaces, circlet, core, core_c08},
  abstract = {Processing large amounts of natural language text using machine
              learning-based models is becoming important in many disciplines.
              This demand is being met by a variety of approaches, resulting
              in the heterogeneous deployment of separate, partly incompatible,
              not natively scalable applications. To overcome the technological
              bottleneck involved, we have developed Docker Unified UIMA Interface,
              a system for the standardized, parallel, platform-independent,
              distributed and microservices-based solution for processing large
              and extensive text corpora with any NLP method. We present DUUI
              as a framework that enables automated orchestration of GPU-based
              NLP processes beyond the existing Docker Swarm cluster variant,
              and in addition to the adaptation to new runtime environments
              such as Kubernetes. Therefore, a new driver for DUUI is introduced,
              which enables the lightweight orchestration of DUUI processes
              within a Kubernetes environment in a scalable setup. In this way,
              the paper opens up novel text-technological perspectives for existing
              practices in disciplines that deal with the scientific analysis
              of large amounts of data based on NLP.}
}
News
-
Best Demo Award at NAACL 2025
by


We are delighted that our paper “Towards Unified, Dynamic and Annotation-based Visualisations and Exploration of Annotated Big Data Corpora with the Help of Unified Corpus Explorer” has received the Best Demo Award at this year’s annual conference of the Nations of the Americas Chapter of the Association for Computational Linguistics (NAACL 2025).
2025. Towards Unified, Dynamic and Annotation-based Visualisations and Exploration of Annotated Big Data Corpora with the Help of Unified Corpus Explorer. Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (System Demonstrations), 522–534. Best Demo Award.BibTeX@inproceedings{Boenisch:et:al:2025, title = {Towards Unified, Dynamic and Annotation-based Visualisations and Exploration of Annotated Big Data Corpora with the Help of Unified Corpus Explorer}, author = {B{\"o}nisch, Kevin and Abrami, Giuseppe and Mehler, Alexander}, editor = {Dziri, Nouha and Ren, Sean (Xiang) and Diao, Shizhe}, booktitle = {Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (System Demonstrations)}, year = {2025}, address = {Albuquerque, New Mexico}, publisher = {Association for Computational Linguistics}, url = {https://aclanthology.org/2025.naacl-demo.42/}, pages = {522--534}, isbn = {979-8-89176-191-9}, abstract = {The annotation and exploration of large text corpora, both automatic and manual, presents significant challenges across multiple disciplines, including linguistics, digital humanities, biology, and legal science. These challenges are exacerbated by the heterogeneity of processing methods, which complicates corpus visualization, interaction, and integration. To address these issues, we introduce the Unified Corpus Explorer (UCE), a standardized, dockerized, open-source and dynamic Natural Language Processing (NLP) application designed for flexible and scalable corpus navigation. Herein, UCE utilizes the UIMA format for NLP annotations as a standardized input, constructing interfaces and features around those annotations while dynamically adapting to the corpora and their extracted annotations. 
We evaluate UCE based on a user study and demonstrate its versatility as a corpus explorer based on generative AI.}, note = {Best Demo Award}, pdf = {https://aclanthology.org/2025.naacl-demo.42.pdf}, keywords = {uce,new-data-spaces,circlet,core,core_c08} }
