
This DFG-funded project (project number: 531750631) develops new methods for the thematic classification of very large text corpora in the digital humanities and social sciences. Focusing on the German Reference Corpus (DeReKo), the world’s largest collection of German-language texts, it addresses the lack of reliable topic-based metadata for heterogeneous and rapidly growing corpora. By combining approaches from computer science and corpus linguistics, the project creates efficient, open-source, and dynamic classification methods that support advanced corpus analysis, stratified sampling, and the study of linguistic variation. The methods are also tested on domain-specific resources, such as Grammis, to enable fine-grained thematic indexing of specialized texts.
Project Locations
The project is carried out at three research institutions:
- Goethe University Frankfurt am Main
  - Project Lead: Prof. Dr. Alexander Mehler
- Saxon Academy of Sciences and Humanities in Leipzig
  - Project Lead: Prof. Dr. Gerhard Heyer
- Leibniz Institute for the German Language
  - Project Leads: Dr. Marc Kupietz, Prof. Dr. Roman Schneider
Team Frankfurt
Publications
BibTeX
@inproceedings{Abusaleh:et:al:2026:sarf,
  author    = {Abusaleh, Ali and Verma, Bhuvanesh and Mehler, Alexander},
  title     = {{TTLab} at {AraSentEval}: {SARF} (صرف) Sentiment Analysis via Root-based
               Fusion for Multi-Dialectal {Arabic}},
  booktitle = {Proceedings of the 7th Workshop on Open-Source Arabic Corpora
               and Processing Tools (OSACT7), co-located with the Language Resources
               and Evaluation Conference (LREC 2026)},
  eventdate = {2026-05},
  location  = {Palma, Mallorca, Spain},
  year      = {2026},
  keywords  = {NLP, Sentiment Analysis, Arabic analysis, new-data-spaces, circlet, satek},
  abstract  = {Arabic sentiment analysis is challenged by morphological complexity
               and lexical variation across Arabic dialects, compounded by subjectivity
               in how speakers and writers express sentiment. In this paper,
               we present our submission for the AraSentEval 2026 Shared Task
               on Arabic Dialect Sentiment Analysis. We propose SARF (صرف) a
               multi-view architectural framework that integrates surface-level
               context with stemmed and rooted morphological perspectives using
               a shared MARBERTv2 encoder. Our system employs a hybrid BERT-CNN-BiLSTM-Attention
               architecture to capture both local sentiment n-grams and global
               sequential dependencies. Experimental results show that while
               individual morphological normalization strategies (stemming or
               rooting) may degrade performance, their joint integration via
               cross-morphological attention provides robust features across
               diverse dialects. Our final system achieved a competitive macro-F1-score
               of 0.9263, ranking 2nd out of 15 participating teams.},
  note      = {accepted}
}
BibTeX
@inproceedings{Verma:Mehler:2026,
  author    = {Verma, Bhuvanesh and Mehler, Alexander},
  title     = {Predicting Topic (Co-)Occurrence Using Topic Networks Built from
               the {Project} {Gutenberg} Corpus},
  booktitle = {Proceedings of the 15th International Conference on Language Resources
               and Evaluation (LREC 2026)},
  year      = {2026},
  keywords  = {Topic Evolution, Topic Network, Time-aware Networks, Temporal Autocorrelation, Project Gutenberg, satek},
  abstract  = {Although temporal topic modeling has been widely applied to scientific
               and legal texts, literary corpora have largely been overlooked
               in this regard. To address this issue, we analyze topic evolution
               in a subset of the Project Gutenberg (PG) corpus. We model this
               subset as a sequence of topic networks that capture the emergence,
               persistence, and interaction of thematic structures over decades.
               Using supervised topic representations, we predict nodes (topics)
               and edges (topic pairings) to forecast future topics and their
               co-occurrence. Our experiments demonstrate moderate to strong
               temporal persistence in topic connectivity patterns across three
               topic systems, with ROC-AUC and AP values consistently above 0.85.
               We find that the temporal span of topic networks significantly
               impacts predictive performance: longer spans improve the stability
               and recall of topic presence, while shorter spans better capture
               evolving topic relationships. Overall, our findings demonstrate
               the predictability of topics in literary texts over time.},
  note      = {accepted}
}
BibTeX
@inproceedings{Verma:et:al:2026,
  author    = {Verma, Bhuvanesh and Marreddy, Mounika and Mehler, Alexander},
  title     = {Predicting Convincingness in Political Speech: How Emotional Tone
               Shapes Persuasive Strength},
  booktitle = {Proceedings of the 15th Workshop on Computational Approaches to
               Subjectivity, Sentiment, \& Social Media Analysis},
  year      = {2026},
  keywords  = {Argument Detection, Argument Quality Assessment, Topic Modelling, Persuasiveness, Convincingness, Emotion Analysis, Argument Mining, satek},
  abstract  = {Emotional tone plays a central role in persuasion, yet its impact
               on computational assessments of political argument quality in
               real world election campaign speeches remains understudied. In
               this work, we investigate whether positive emotional framing correlates
               with higher perceived convincingness in political arguments. We
               fine-tune language models on argument quality datasets and test
               their ability to transfer convincingness predictions to real-world
               campaign speeches. Using a corpus of U.S. presidential campaign
               speeches, we analyze emotional polarity in relation to predicted
               persuasive strength to test whether positively framed arguments
               are judged more convincing than neutral or negative ones. Our
               empirical analysis shows that political parties rely heavily on
               argumentation during their election campaigns. Also, we found
               the evidence that politicians strategically employ emotional cues
               within their arguments during these campaign speeches, with positive
               emotions being more strongly associated with persuasive strength,
               for example in topics such as USMCA’s Effect on American Jobs
               and Agriculture, Border Control Policies, Progressive Tax Reforms.
               At the same time, we find that negative emotions have a weaker
               yet still non-negligible influence on voter persuasion in topics
               such as City Crime and Civil Unrest and White Supremacist Violence
               (Charlottesville Incident).},
  note      = {accepted}
}

