The following papers have been accepted for publication in the proceedings of the Language Resources and Evaluation Conference 2026.
GhostWriter: Hidden AI-Generated Texts Over Multiple Languages, Domains and Generators
BibTeX
@inproceedings{Schaaf:et:al:2026,
  author    = {Schaaf, Manuel and Bönisch, Kevin and Mehler, Alexander},
  title     = {{GhostWriter}: Hidden {AI}-Generated Texts over Multiple Languages,
               Domains and Generators},
  booktitle = {Proceedings of the Fifteenth Language Resources and Evaluation
               Conference (LREC 2026)},
  editor    = {Piperidis, Stelios and Bel, Núria and van den Heuvel, Henk and Ide, Nancy
               and Krek, Simon and Toral, Antonio},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Palma, Mallorca, Spain},
  month     = may,
  year      = {2026},
  pages     = {10497--10516},
  doi       = {10.63317/57fd7juh5zek},
  keywords  = {Corpus, Natural Language Generation, Validation of LRs, AI-generated Text Detection, core, core_b05},
  abstract  = {The advent of Transformer-based Large Language Models (LLMs) has
               led to an unprecedented surge of AI-generated text (AIGT) across
               online platforms and academic domains. While these models exhibit
               near-human fluency and stylistic coherence, their widespread adoption
               has raised concerns about authorship integrity, research quality,
               and the recursive contamination of training corpora with synthetic
               data. These developments underscore the need for reliable AIGT
               detection methods and benchmark datasets, particularly for malicious
               or deceptive ghostwriting scenarios where AIGT is intentionally
               crafted to evade detection. To address this, we present GhostWriter,
               a large-scale, bilingual (German and English), multi-generator,
               and multi-domain dataset for AIGT detection. The dataset comprises
               human- and AI-authored texts produced under domain-specific ghostwriting
               conditions, including examples intentionally embedded within otherwise
               human-written texts to obscure their AI origin. With GhostWriter,
               we (i) aim to expand the resources available for German AIGT datasets,
               (ii) emphasize mixed or fused synthesizations—since most existing
               corpora are limited to the document level—and (iii) introduce
               specifically crafted malicious ghostwriting scenarios across multiple
               domains and generators.}
}
Towards the Generation and Application of Dynamic Web-Based Visualization of UIMA-based Annotations for Big-Data Corpora with the Help of Unified Dynamic Annotation Visualizer
BibTeX
@inproceedings{Dahmann:et:al:2026,
  author    = {Dahmann, Thiemo and Schneider, Julian and Stephan, Philipp and Abrami, Giuseppe
               and Mehler, Alexander},
  title     = {Towards the Generation and Application of Dynamic Web-Based Visualization
               of {UIMA}-based Annotations for Big-Data Corpora with the Help of
               {Unified Dynamic Annotation Visualizer}},
  booktitle = {Proceedings of the Fifteenth Language Resources and Evaluation
               Conference (LREC 2026)},
  editor    = {Piperidis, Stelios and Bel, Núria and van den Heuvel, Henk and Ide, Nancy
               and Krek, Simon and Toral, Antonio},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Palma, Mallorca, Spain},
  year      = {2026},
  pages     = {6695--6705},
  doi       = {10.63317/5ce2aaity4yz},
  keywords  = {NLP, UIMA, Annotations, dynamic visualization, uce},
  abstract  = {The automatic and manual annotation of unstructured corpora is
               a routine task in many scientific fields and is supported by a
               variety of existing software solutions. Despite this variety,
               few solutions currently support annotation visualization, especially
               for dynamic generation and interaction. To bridge this gap and
               visualize annotated corpora based on user-, project-, or corpus-specific
               aspects, we developed Unified Dynamic Annotation Visualizer (UDAV).
               UDAV is a web-based solution that implements features not supported
               by comparable tools, enabling a customizable and extensible toolbox
               for interacting with annotations and allowing integration into
               existing big-data frameworks. We exemplify UDAV through a range
               of visualizations and also provide an evaluation of corpus import
               and processing performance.},
  pdf       = {http://www.lrec-conf.org/proceedings/lrec2026/pdf/2026.lrec2026-1.533.pdf},
  video     = {https://www.youtube.com/watch?v=LFBiGlmEDog}
}
Predicting Topic (Co-)Occurrence Using Topic Networks Built from the Project Gutenberg Corpus
BibTeX
@inproceedings{Verma:Mehler:2026,
  author    = {Verma, Bhuvanesh and Mehler, Alexander},
  title     = {Predicting Topic (Co-)Occurrence Using Topic Networks Built from
               the {Project Gutenberg} Corpus},
  booktitle = {Proceedings of the Fifteenth Language Resources and Evaluation
               Conference (LREC 2026)},
  editor    = {Piperidis, Stelios and Bel, Núria and van den Heuvel, Henk and Ide, Nancy
               and Krek, Simon and Toral, Antonio},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Palma, Mallorca, Spain},
  year      = {2026},
  pages     = {860--869},
  doi       = {10.63317/58x3h7gjbpb4},
  keywords  = {Topic Evolution, Topic Network, Time-aware Networks, Temporal Autocorrelation, Project Gutenberg, satek},
  abstract  = {Although temporal topic modeling has been widely applied to scientific
               and legal texts, literary corpora have largely been overlooked
               in this regard. To address this issue, we analyze topic evolution
               in a subset of the Project Gutenberg (PG) corpus. We model this
               subset as a sequence of topic networks that capture the emergence,
               persistence, and interaction of thematic structures over decades.
               Using supervised topic representations, we predict nodes (topics)
               and edges (topic pairings) to forecast future topics and their
               co-occurrence. Our experiments demonstrate moderate to strong
               temporal persistence in topic connectivity patterns across three
               topic systems, with ROC-AUC and AP values consistently above 0.85.
               We find that the temporal span of topic networks significantly
               impacts predictive performance: longer spans improve the stability
               and recall of topic presence, while shorter spans better capture
               evolving topic relationships. Overall, our findings demonstrate
               the predictability of topics in literary texts over time.},
  pdf       = {http://www.lrec-conf.org/proceedings/lrec2026/pdf/2026.lrec2026-1.65.pdf}
}
