Leon Hammerla – Text Technology Lab

PhD Student

Contact

Goethe-Universität Frankfurt am Main
Robert-Mayer-Straße 10
Room 401b
D-60325 Frankfurt am Main
D-60054 Frankfurt am Main (use for package delivery)
Postfach / P.O. Box: 154
Phone:
Mail:

Leon Hammerla @LeonHammerla

35 Repos 0 Gists 1 Followers

Project

Negation in Language and Beyond (NegLaB). 2024 – . Funded by DFG (SFB 1629).

NegLab Publications

Leon Hammerla and Alexander Mehler. 2026. Gutenberg+: A More Temporally Faithful Corpus for Diachronic NLP. Proceedings Workshop on Structured Linguistic Data and Evaluation (SLiDE 2026), co-located with the Language Resources and Evaluation Conference (LREC 2026). accepted.

BibTeX

@inproceedings{Hammerla:Mehler:2026:a,
  title     = {{Gutenberg+}: A More Temporally Faithful Corpus for Diachronic {NLP}},
  author    = {Leon Hammerla and Alexander Mehler},
  booktitle = {Proceedings Workshop on Structured Linguistic Data and Evaluation
               (SLiDE 2026), co-located with the Language Resources and Evaluation
               Conference (LREC 2026)},
  address   = {Palma de Mallorca (Spain)},
  year      = {2026},
  keywords  = {neglab},
  note      = {accepted}
}

Ali Abusaleh, Leon Hammerla and Alexander Mehler. 2026. Learning to Detect Cross-Modal Negation: An Analysis of Latent Representations and an Attention-Based Solution. 2026 8th International Conference on Natural Language Processing (ICNLP). accepted.

BibTeX

@inproceedings{Abusaleh:et:al:2026,
  title     = {Learning to Detect Cross-Modal Negation: An Analysis of Latent
               Representations and an Attention-Based Solution},
  author    = {Abusaleh, Ali and Hammerla, Leon and Mehler, Alexander},
  booktitle = {2026 8th International Conference on Natural Language Processing (ICNLP)},
  eventdate = {2026-03-20/2026-03-22},
  location  = {Xi'an,China},
  year      = {2026},
  keywords  = {Vision language model, Natural language processing, Cross-modal retrieval, negation detection, video analysis, Multimodal analysis, Political Communication, neglab, new-data-spaces, circlet},
  abstract  = {Detecting high-level semantic concepts like negation across modalities
               remains a challenge for current multimodal systems. We analyze
               this as a fundamental representation learning problem, providing
               the first evidence that negation does not form a linearly or non-linearly
               separable class in the latent spaces of standard vision-language
               models (VLMs). We demonstrate that pretrained embeddings primarily
               encode modality-specific features, lacking a generalizable negation
               signal. To overcome this, we propose a novel cross-modal attention
               architecture that explicitly models inter-modal dependencies,
               achieving performance gains of up to +7.03% F1 over unimodal baselines.
               Our analysis reveals a key asymmetry: while textual negation often
               appears independently, visual negation is semantically dependent
               on linguistic context, a finding validated through our statistical
               analysis of 3,222 political video-text pairs automatically annotated
               via Qwen2.5-VL. By combining this analysis with self-supervised
               video representations (JEPA2), we advance the modeling of temporal
               negation. This work provides new methods and insights for learning
               robust, semantically-aligned representations in multimodal systems.},
  note      = {accepted}
}

Andy Lücking, Leon Hammerla and Alexander Mehler. 2026. Not every quantifier can be negated. Proceedings of Sinn und Bedeutung, Special Session “Philosophical and Linguistic Approaches to Negation (PhilLingNeg)”. accepted.

BibTeX

@inproceedings{Luecking:Hammerla:Mehler:2026,
  author    = {Lücking, Andy and Hammerla, Leon and Mehler, Alexander},
  title     = {Not every quantifier can be negated},
  booktitle = {Proceedings of \textit{Sinn und Bedeutung}, Special Session ``Philosophical
               and Linguistic Approaches to Negation (PhilLingNeg)''},
  series    = {SuB'30},
  location  = {Frankfurt am Main},
  year      = {2026},
  pubstate  = {forthcoming},
  keywords  = {neglab},
  note      = {accepted}
}

Leon Hammerla, Andy Lücking, Carolin Reinert and Alexander Mehler. December, 2025. D-Neg: Syntax-Aware Graph Reasoning for Negation Detection. Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics, 1432–1454.

BibTeX

@inproceedings{Hammerla:et:al:2025b,
  author    = {Hammerla, Leon and Lücking, Andy and Reinert, Carolin and Mehler, Alexander},
  title     = {{D}-Neg: Syntax-Aware Graph Reasoning for Negation Detection},
  editor    = {Inui, Kentaro and Sakti, Sakriani and Wang, Haofen and Wong, Derek F.
               and Bhattacharyya, Pushpak and Banerjee, Biplab and Ekbal, Asif and Chakraborty, Tanmoy
               and Singh, Dhirendra Pratap},
  booktitle = {Proceedings of the 14th International Joint Conference on Natural
               Language Processing and the 4th Conference of the Asia-Pacific
               Chapter of the Association for Computational Linguistics},
  month     = {dec},
  year      = {2025},
  address   = {Mumbai, India},
  publisher = {The Asian Federation of Natural Language Processing and The Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-ijcnlp.89/},
  pages     = {1432--1454},
  isbn      = {979-8-89176-303-6},
  abstract  = {Despite the communicative importance of negation, its detection
               remains challenging. Previous approaches perform poorly in out-of-domain
               scenarios, and progress outside of English has been slow due to
               a lack of resources and robust models. To address this gap, we
               present D-Neg: a syntax-aware graph reasoning model based on a
               transformer that incorporates syntactic embeddings by attention-gating.
               D-Neg uses graph attention to represent syntactic structures,
               emulating the effectiveness of rule-based dependency approaches
               for negation detection. We train D-Neg using 7 English resources
               and their translations into 10 languages, all aligned at the annotation
               level. We conduct an evaluation of all these datasets in in-domain
               and out-of-domain settings. Our work represents a significant
               advance in negation detection, enabling more effective cross-lingual
               research.},
  keywords  = {neglab}
}

Leon Hammerla, Alexander Mehler and Giuseppe Abrami. December, 2025. Standardizing Heterogeneous Corpora with DUUR: A Dual Data- and Process-Oriented Approach to Enhancing NLP Pipeline Integration. Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics, 1410–1425.

BibTeX

@inproceedings{Hammerla:et:al:2025a,
  author    = {Hammerla, Leon and Mehler, Alexander and Abrami, Giuseppe},
  title     = {Standardizing Heterogeneous Corpora with {DUUR}: A Dual Data-
               and Process-Oriented Approach to Enhancing NLP Pipeline Integration},
  editor    = {Inui, Kentaro and Sakti, Sakriani and Wang, Haofen and Wong, Derek F.
               and Bhattacharyya, Pushpak and Banerjee, Biplab and Ekbal, Asif and Chakraborty, Tanmoy
               and Singh, Dhirendra Pratap},
  booktitle = {Proceedings of the 14th International Joint Conference on Natural
               Language Processing and the 4th Conference of the Asia-Pacific
               Chapter of the Association for Computational Linguistics},
  month     = {dec},
  year      = {2025},
  address   = {Mumbai, India},
  publisher = {The Asian Federation of Natural Language Processing and The Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-ijcnlp.87/},
  pages     = {1410--1425},
  isbn      = {979-8-89176-303-6},
  abstract  = {Despite their success, LLMs are too computationally expensive
               to replace task- or domain-specific NLP systems. However, the
               variety of corpus formats makes reusing these systems difficult.
               This underscores the importance of maintaining an interoperable
               NLP landscape. We address this challenge by pursuing two objectives:
               standardizing corpus formats and enabling massively parallel corpus
               processing. We present a unified conversion framework embedded
               in a massively parallel, microservice-based, programming language-independent
               NLP architecture designed for modularity and extensibility. It
               allows for the integration of external NLP conversion tools and
               supports the addition of new components that meet basic compatibility
               requirements. To evaluate our dual data- and process-oriented
               approach to standardization, we (1) benchmark its efficiency in
               terms of processing speed and memory usage, (2) demonstrate the
               benefits of standardized corpus formats for NLP downstream tasks,
               and (3) illustrate the advantages of incorporating custom formats
               into a corpus format ecosystem.},
  keywords  = {neglab,duui}
}

Thesis topic proposals

2025

Master Thesis: Negation and LLM Reasoning.

Description

As lexical and logical negation appears to play a crucial role in human reasoning and inquiry, we are interested in analyzing negation patterns in reasoning traces produced by large language models (LLMs), as well as in LLM reasoning frameworks that explicitly incorporate negation, with the goal of better mimicking human reasoning. Possible directions for this thesis include: (1) The development of LLM reasoning frameworks centered around the phenomenon of negation and their evaluation against existing frameworks such as Chain-of-Thought (CoT) or Tree-of-Thought (ToT). (2) Negation-centered fine-tuning of LLM reasoning. (3) Qualitative and quantitative analysis of reasoning traces produced by LLMs, focusing on negation patterns.
Corresponding Lab Member: Leon Hammerla and Alexander Mehler.

Bachelor Thesis: Detecting the negated Event/Detecting the Focus of Negation.

Description

Classical negation annotation in computational linguistics involves identifying the negation cue, determining the scope of the negation, and detecting both the negated event and the most prominent part of the scope that is negated (the focus). While reliable systems already exist for detecting negation cues and scopes, current frameworks need to be extended to identify the negated event and/or the focus. For a bachelor thesis, addressing one of these two aspects is sufficient; for a master thesis, both should be tackled. A Python-based pipeline for cue and scope detection is already available, and the newly developed detection modules can be integrated into this existing framework (python).
Corresponding Lab Member: Leon Hammerla and Alexander Mehler.

If you have any suggestions of your own relating to this or our other proposed topics, please do not hesitate to contact us.

In addition, we provide a mailing list for free, which we use to inform regularly about updates on new qualification and research work as well as other information relating to Texttechnology.

Publications

2026

Sonja Hahn, Leon Hammerla, Corinna Hankeln, Sebastian Gross, Marie Steinke, Christina M. Röper Korf and Ulf Kroehne. April, 2026. Using Artificial Intelligence for Eliciting Diagnostic Evidence From Students’ Drawings: A Case Study From a Formative Mathematics Assessment. Psychological Test Adaptation and Development, 7:73–90.

BibTeX

@article{hahn:etal:2026,
  title     = {Using Artificial Intelligence for Eliciting Diagnostic Evidence
               From Students’ Drawings: A Case Study From a Formative Mathematics
               Assessment},
  volume    = {7},
  issn      = {2698-1866},
  url       = {http://dx.doi.org/10.1027/2698-1866/a000123},
  doi       = {10.1027/2698-1866/a000123},
  journal   = {Psychological Test Adaptation and Development},
  publisher = {Hogrefe Publishing Group},
  author    = {Hahn, Sonja and Hammerla, Leon and Hankeln, Corinna and Gross, Sebastian
               and Steinke, Marie and R\"{o}per Korf, Christina M. and Kroehne, Ulf},
  year      = {2026},
  month     = {apr},
  pages     = {73–90}
}

BibTeX

@inproceedings{Hammerla:Mehler:2026:a,
  title     = {{Gutenberg+}: A More Temporally Faithful Corpus for Diachronic {NLP}},
  author    = {Leon Hammerla and Alexander Mehler},
  booktitle = {Proceedings Workshop on Structured Linguistic Data and Evaluation
               (SLiDE 2026), co-located with the Language Resources and Evaluation
               Conference (LREC 2026)},
  address   = {Palma de Mallorca (Spain)},
  year      = {2026},
  keywords  = {neglab},
  note      = {accepted}
}

BibTeX

@inproceedings{Abusaleh:et:al:2026,
  title     = {Learning to Detect Cross-Modal Negation: An Analysis of Latent
               Representations and an Attention-Based Solution},
  author    = {Abusaleh, Ali and Hammerla, Leon and Mehler, Alexander},
  booktitle = {2026 8th International Conference on Natural Language Processing (ICNLP)},
  eventdate = {2026-03-20/2026-03-22},
  location  = {Xi'an,China},
  year      = {2026},
  keywords  = {Vision language model, Natural language processing, Cross-modal retrieval, negation detection, video analysis, Multimodal analysis, Political Communication, neglab, new-data-spaces, circlet},
  abstract  = {Detecting high-level semantic concepts like negation across modalities
               remains a challenge for current multimodal systems. We analyze
               this as a fundamental representation learning problem, providing
               the first evidence that negation does not form a linearly or non-linearly
               separable class in the latent spaces of standard vision-language
               models (VLMs). We demonstrate that pretrained embeddings primarily
               encode modality-specific features, lacking a generalizable negation
               signal. To overcome this, we propose a novel cross-modal attention
               architecture that explicitly models inter-modal dependencies,
               achieving performance gains of up to +7.03% F1 over unimodal baselines.
               Our analysis reveals a key asymmetry: while textual negation often
               appears independently, visual negation is semantically dependent
               on linguistic context, a finding validated through our statistical
               analysis of 3,222 political video-text pairs automatically annotated
               via Qwen2.5-VL. By combining this analysis with self-supervised
               video representations (JEPA2), we advance the modeling of temporal
               negation. This work provides new methods and insights for learning
               robust, semantically-aligned representations in multimodal systems.},
  note      = {accepted}
}

Longwei Cong, Leon Hammerla, Sonja Hahn, Sebastian Gombert, Hendrik Drachsler and Ulf Kröhne. 2026. Automatic Short Answer Grading with LLMs: From Memorization to Reasoning. Proceedings of the 16th International Learning Analytics & Knowledge Conference (LAK26). accepted.

BibTeX

@inproceedings{Cong:et:al:2026a,
  author    = {Cong, Longwei and Hammerla, Leon and Hahn, Sonja and Gombert, Sebastian
               and Drachsler, Hendrik and Kr{\"o}hne, Ulf},
  title     = {Automatic Short Answer Grading with LLMs: From Memorization to Reasoning},
  booktitle = {Proceedings of the 16th International Learning Analytics \& Knowledge
               Conference (LAK26)},
  series    = {LAK26},
  year      = {2026},
  pubstate  = {forthcoming},
  location  = {Bergen, Norway},
  note      = {accepted},
  abstract  = {Short-answer questions provide valuable insights into students’
               understanding and cognitive processes for learning analytics.
               However, they are difficult to grade automatically as they require
               a high level of language comprehension. Automatic Short Answer
               Grading (ASAG) is therefore essential in large-scale educational
               settings. Recent work has applied encode-only pre-trained language
               models (PLMs), such as BERT, and generative large language models
               (LLMs) to ASAG. Although fine-tuned BERT-based models currently
               produce state-of-the-art results, they depend on substantial annotated
               datasets, which are frequently expensive and insufficient. This
               paper examines the performance of fine-tuning of several PLMs
               and LLMs for different dataset sizes and compares the results
               to those of prompt-based approaches. General-purpose and domain-specific
               models were fine-tuned on datasets ranging from 800 to 26,674
               student responses. Different prompt engineering strategies were
               tested including rubric-based prompts. Our results demonstrate
               that fine-tuned LLMs and rubric-based prompting can match or exceed
               the performance of BERT-based models. Rubric-based prompts with
               open-source model deliver comparable results without the need
               for annotation data or hardware-intensive training, while also
               mitigating data protection concerns. This work provides empirical
               evidence of the role of LLMs in ASAG and paves the way for future
               research into resource-efficient, interpretable and reasoning-driven
               grading.}
}

BibTeX

@inproceedings{Luecking:Hammerla:Mehler:2026,
  author    = {Lücking, Andy and Hammerla, Leon and Mehler, Alexander},
  title     = {Not every quantifier can be negated},
  booktitle = {Proceedings of \textit{Sinn und Bedeutung}, Special Session ``Philosophical
               and Linguistic Approaches to Negation (PhilLingNeg)''},
  series    = {SuB'30},
  location  = {Frankfurt am Main},
  year      = {2026},
  pubstate  = {forthcoming},
  keywords  = {neglab},
  note      = {accepted}
}

2025

BibTeX

@inproceedings{Hammerla:et:al:2025b,
  author    = {Hammerla, Leon and Lücking, Andy and Reinert, Carolin and Mehler, Alexander},
  title     = {{D}-Neg: Syntax-Aware Graph Reasoning for Negation Detection},
  editor    = {Inui, Kentaro and Sakti, Sakriani and Wang, Haofen and Wong, Derek F.
               and Bhattacharyya, Pushpak and Banerjee, Biplab and Ekbal, Asif and Chakraborty, Tanmoy
               and Singh, Dhirendra Pratap},
  booktitle = {Proceedings of the 14th International Joint Conference on Natural
               Language Processing and the 4th Conference of the Asia-Pacific
               Chapter of the Association for Computational Linguistics},
  month     = {dec},
  year      = {2025},
  address   = {Mumbai, India},
  publisher = {The Asian Federation of Natural Language Processing and The Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-ijcnlp.89/},
  pages     = {1432--1454},
  isbn      = {979-8-89176-303-6},
  abstract  = {Despite the communicative importance of negation, its detection
               remains challenging. Previous approaches perform poorly in out-of-domain
               scenarios, and progress outside of English has been slow due to
               a lack of resources and robust models. To address this gap, we
               present D-Neg: a syntax-aware graph reasoning model based on a
               transformer that incorporates syntactic embeddings by attention-gating.
               D-Neg uses graph attention to represent syntactic structures,
               emulating the effectiveness of rule-based dependency approaches
               for negation detection. We train D-Neg using 7 English resources
               and their translations into 10 languages, all aligned at the annotation
               level. We conduct an evaluation of all these datasets in in-domain
               and out-of-domain settings. Our work represents a significant
               advance in negation detection, enabling more effective cross-lingual
               research.},
  keywords  = {neglab}
}

BibTeX

@inproceedings{Hammerla:et:al:2025a,
  author    = {Hammerla, Leon and Mehler, Alexander and Abrami, Giuseppe},
  title     = {Standardizing Heterogeneous Corpora with {DUUR}: A Dual Data-
               and Process-Oriented Approach to Enhancing NLP Pipeline Integration},
  editor    = {Inui, Kentaro and Sakti, Sakriani and Wang, Haofen and Wong, Derek F.
               and Bhattacharyya, Pushpak and Banerjee, Biplab and Ekbal, Asif and Chakraborty, Tanmoy
               and Singh, Dhirendra Pratap},
  booktitle = {Proceedings of the 14th International Joint Conference on Natural
               Language Processing and the 4th Conference of the Asia-Pacific
               Chapter of the Association for Computational Linguistics},
  month     = {dec},
  year      = {2025},
  address   = {Mumbai, India},
  publisher = {The Asian Federation of Natural Language Processing and The Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-ijcnlp.87/},
  pages     = {1410--1425},
  isbn      = {979-8-89176-303-6},
  abstract  = {Despite their success, LLMs are too computationally expensive
               to replace task- or domain-specific NLP systems. However, the
               variety of corpus formats makes reusing these systems difficult.
               This underscores the importance of maintaining an interoperable
               NLP landscape. We address this challenge by pursuing two objectives:
               standardizing corpus formats and enabling massively parallel corpus
               processing. We present a unified conversion framework embedded
               in a massively parallel, microservice-based, programming language-independent
               NLP architecture designed for modularity and extensibility. It
               allows for the integration of external NLP conversion tools and
               supports the addition of new components that meet basic compatibility
               requirements. To evaluate our dual data- and process-oriented
               approach to standardization, we (1) benchmark its efficiency in
               terms of processing speed and memory usage, (2) demonstrate the
               benefits of standardized corpus formats for NLP downstream tasks,
               and (3) illustrate the advantages of incorporating custom formats
               into a corpus format ecosystem.},
  keywords  = {neglab,duui}
}

Sonja Hahn, Leon Hammerla, Corinna Hankeln, Sebastian Groß, Christina Röpers and Ulf Kröhne. 2025. Constructed Responses beyond NLP – Auswertungsansätze für graphische Antworten. Inproceedings of 12. Jahrestagung der Gesellschaft für empirische Bildungsforschung (GEBF 2025).

BibTeX

@inproceedings{Hahn:et:al:2025,
  author    = {Sonja Hahn and Leon Hammerla and Corinna Hankeln and Sebastian Groß
               and Christina Röpers and Ulf Kröhne},
  title     = {Constructed Responses beyond NLP – Auswertungsansätze für graphische Antworten},
  booktitle = {Inproceedings of 12. Jahrestagung der Gesellschaft für empirische
               Bildungsforschung (GEBF 2025)},
  location  = {Mannheim, Deutschland},
  year      = {2025}
}

2024

Ulf Kröhne, Leon Hammerla, Corinna Hankeln, Marc Müller and Sonja Hahn. 2024. How much training data are required? Automatic scoring using prompting compared to text classification tasks as fine-tuning large-language models. Inproceedings of 53. Kongress der Deutschen Gesellschaft für Psychologie / 15. ÖGP Conference.

BibTeX

@inproceedings{Kroehne:et:al:2024,
  author    = {Ulf Kröhne and Leon Hammerla and Corinna Hankeln and Marc Müller and Sonja Hahn},
  title     = {How much training data are required? Automatic scoring using prompting
               compared to text classification tasks as fine-tuning large-language
               models},
  booktitle = {Inproceedings of 53. Kongress der Deutschen Gesellschaft für Psychologie
               / 15. ÖGP Conference},
  location  = {Wien, Österreich},
  year      = {2024}
}

Andy Lücking, Giuseppe Abrami, Leon Hammerla, Marc Rahn, Daniel Baumartz, Steffen Eger and Alexander Mehler. May, 2024. Dependencies over Times and Tools (DoTT). Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), 4641–4653.

BibTeX

@inproceedings{Luecking:et:al:2024,
  abstract  = {Purpose: Based on the examples of English and German, we investigate
               to what extent parsers trained on modern variants of these languages
               can be transferred to older language levels without loss. Methods:
               We developed a treebank called DoTT (https://github.com/texttechnologylab/DoTT)
               which covers, roughly, the time period from 1800 until today,
               in conjunction with the further development of the annotation
               tool DependencyAnnotator. DoTT consists of a collection of diachronic
               corpora enriched with dependency annotations using 3 parsers,
               6 pre-trained language models, 5 newly trained models for German,
               and two tag sets (TIGER and Universal Dependencies). To assess
               how the different parsers perform on texts from different time
               periods, we created a gold standard sample as a benchmark. Results:
               We found that the parsers/models perform quite well on modern
               texts (document-level LAS ranging from 82.89 to 88.54) and slightly
               worse on older texts, as expected (average document-level LAS
               84.60 vs. 86.14), but not significantly. For German texts, the
               (German) TIGER scheme achieved slightly better results than UD.
               Conclusion: Overall, this result speaks for the transferability
               of parsers to past language levels, at least dating back until
               around 1800. This very transferability, it is however argued,
               means that studies of language change in the field of dependency
               syntax can draw on dependency distance but miss out on some grammatical
               phenomena.},
  address   = {Torino, Italy},
  author    = {L{\"u}cking, Andy and Abrami, Giuseppe and Hammerla, Leon and Rahn, Marc
               and Baumartz, Daniel and Eger, Steffen and Mehler, Alexander},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational
               Linguistics, Language Resources and Evaluation (LREC-COLING 2024)},
  editor    = {Calzolari, Nicoletta and Kan, Min-Yen and Hoste, Veronique and Lenci, Alessandro
               and Sakti, Sakriani and Xue, Nianwen},
  month     = {may},
  pages     = {4641--4653},
  publisher = {ELRA and ICCL},
  title     = {Dependencies over Times and Tools ({D}o{TT})},
  url       = {https://aclanthology.org/2024.lrec-main.415},
  poster    = {https://www.texttechnologylab.org/wp-content/uploads/2024/05/LREC_2024_Poster_DoTT.pdf},
  year      = {2024}
}

2022

Giuseppe Abrami, Mevlüt Bagci, Leon Hammerla and Alexander Mehler. 2022. German Parliamentary Corpus (GerParCor). Proceedings of the Language Resources and Evaluation Conference, 1900–1906.

BibTeX

@inproceedings{Abrami:Bagci:Hammerla:Mehler:2022,
  author    = {Abrami, Giuseppe and Bagci, Mevlüt and Hammerla, Leon and Mehler, Alexander},
  editor    = {Calzolari, Nicoletta and B\'echet, Fr\'ed\'eric and Blache, Philippe
               and Choukri, Khalid and Cieri, Christopher and Declerck, Thierry and Goggi, Sara
               and Isahara, Hitoshi and Maegaard, Bente and Mariani, Joseph and Mazo, H\'el\`ene
               and Odijk, Jan and Piperidis, Stelios},
  title     = {German Parliamentary Corpus (GerParCor)},
  booktitle = {Proceedings of the Language Resources and Evaluation Conference},
  year      = {2022},
  address   = {Marseille, France},
  publisher = {European Language Resources Association},
  pages     = {1900--1906},
  abstract  = {Parliamentary debates represent a large and partly unexploited
               treasure trove of publicly accessible texts. In the German-speaking
               area, there is a certain deficit of uniformly accessible and annotated
               corpora covering all German-speaking parliaments at the national
               and federal level. To address this gap, we introduce the German
               Parliamentary Corpus (GerParCor). GerParCor is a genre-specific
               corpus of (predominantly historical) German-language parliamentary
               protocols from three centuries and four countries, including state
               and federal level data. In addition, GerParCor contains conversions
               of scanned protocols and, in particular, of protocols in Fraktur
               converted via an OCR process based on Tesseract. All protocols
               were preprocessed by means of the NLP pipeline of spaCy3 and automatically
               annotated with metadata regarding their session date. GerParCor
               is made available in the XMI format of the UIMA project. In this
               way, GerParCor can be used as a large corpus of historical texts
               in the field of political communication for various tasks in NLP.},
  url       = {https://aclanthology.org/2022.lrec-1.202},
  poster    = {https://www.texttechnologylab.org/wp-content/uploads/2022/06/GerParCor_LREC_2022.pdf},
  keywords  = {gerparcor},
  pdf       = {http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.202.pdf}
}