Publications
2024
September, 2024.
On German verb sense disambiguation: A three-part approach based
on linking a sense inventory (GermaNet) to a corpus through annotation
(TGVCorp) and using the corpus to train a VSD classifier (TTvSense). Journal of Language Modelling, 12(1):155–212.
BibTeX
@article{Mattern:Hemati:Lücking:Mehler:2024,
author = {Mattern, Dominik and Hemati, Wahed and Lücking, Andy and Mehler, Alexander},
title = {On German verb sense disambiguation: A three-part approach based
on linking a sense inventory (GermaNet) to a corpus through annotation
(TGVCorp) and using the corpus to train a VSD classifier (TTvSense)},
abstractnote = {We develop a three-part approach to Verb Sense Disambiguation (VSD) in German. After considering a set of lexical resources and corpora, we arrive at a statistically motivated selection of a subset of verbs and their senses from GermaNet. This sub-inventory is then used to disambiguate the occurrences of the corresponding verbs in a corpus resulting from the union of TüBa-D/Z, Salsa, and E-VALBU. The corpus annotated in this way is called TGVCorp. It is used in the third part of the paper for training a classifier for VSD and for its comparative evaluation with a state-of-the-art approach in this research area, namely EWISER. Our simple classifier outperforms the transformer-based approach on the same data in both accuracy and speed in German but not in English and we discuss possible reasons.},
journal = {Journal of Language Modelling},
volume = {12},
number = {1},
year = {2024},
month = {Sep.},
pages = {155–212},
url = {https://jlm.ipipan.waw.pl/index.php/JLM/article/view/356}
}
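To make the flavor of such a verb sense disambiguation setup concrete, here is a minimal, illustrative Python sketch of a per-verb sense classifier over bag-of-context-words features. It is not the paper's TTvSense architecture; the toy contexts and sense labels are assumptions.

# Illustrative sketch only: a per-verb sense classifier using simple
# bag-of-context-words features. This is NOT the TTvSense architecture;
# the toy contexts and GermaNet-style sense labels below are assumptions.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

contexts = [
    "er nimmt den Zug nach Berlin",
    "sie nimmt das Buch vom Tisch",
    "er nimmt den Bus zur Arbeit",
    "sie nimmt den Stift in die Hand",
]
senses = ["nehmen.transport", "nehmen.greifen", "nehmen.transport", "nehmen.greifen"]

clf = make_pipeline(CountVectorizer(), LogisticRegression(max_iter=1000))
clf.fit(contexts, senses)
print(clf.predict(["er nimmt die Bahn"]))  # likely nehmen.transport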
2021
2021.
From distinguishability to informativity. A quantitative text
model for detecting random texts. In: Language and Text: Data, Models, Information and Applications, 145–162.
John Benjamins Publishing Company.
BibTeX
@incollection{Konca:et:al:2021,
title = {From distinguishability to informativity. A quantitative text
model for detecting random texts},
author = {Konca, Maxim and Mehler, Alexander and Baumartz, Daniel and Hemati, Wahed},
booktitle = {Language and Text: Data, Models, Information and Applications},
volume = {356},
pages = {145--162},
year = {2021},
editor = {Adam Paw{\l}owski and Jan Ma{\v{c}}utek and Sheila Embleton and George Mikros},
publisher = {John Benjamins Publishing Company},
doi = {10.1075/cilt.356.10kon}
}
2020
2020.
From Topic Networks to Distributed Cognitive Maps: Zipfian Topic
Universes in the Area of Volunteered Geographic Information. Complexity, 4:1–47.
BibTeX
@article{Mehler:Gleim:Gaitsch:Uslu:Hemati:2020,
author = {Alexander Mehler and R{\"{u}}diger Gleim and Regina Gaitsch and Tolga Uslu
and Wahed Hemati},
title = {From Topic Networks to Distributed Cognitive Maps: {Zipfian} Topic
Universes in the Area of Volunteered Geographic Information},
journal = {Complexity},
volume = {4},
doi = {10.1155/2020/4607025},
pages = {1--47},
issuetitle = {Cognitive Network Science: A New Frontier},
year = {2020}
}
May, 2020.
Recognizing Sentence-level Logical Document Structures with the
Help of Context-free Grammars. Proceedings of The 12th Language Resources and Evaluation Conference, 5282–5290.
BibTeX
@inproceedings{Hildebrand:Hemati:Mehler:2020,
author = {Hildebrand, Jonathan and Hemati, Wahed and Mehler, Alexander},
title = {Recognizing Sentence-level Logical Document Structures with the
Help of Context-free Grammars},
booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference},
month = {May},
year = {2020},
address = {Marseille, France},
publisher = {European Language Resources Association},
pages = {5282--5290},
abstract = {Current sentence boundary detectors split documents into sequentially
ordered sentences by detecting their beginnings and ends. Sentences,
however, are more deeply structured even on this side of constituent
and dependency structure: they can consist of a main sentence
and several subordinate clauses as well as further segments (e.g.
inserts in parentheses); they can even recursively embed whole
sentences and then contain multiple sentence beginnings and ends.
In this paper, we introduce a tool that segments sentences into
tree structures to detect this type of recursive structure. To
this end, we retrain different constituency parsers with the help
of modified training data to transform them into sentence segmenters.
With these segmenters, documents are mapped to sequences of sentence-related
“logical document structures”. The resulting segmenters aim to
improve downstream tasks by providing additional structural information.
In this context, we experiment with German dependency parsing.
We show that for certain sentence categories, which can be determined
automatically, improvements in German dependency parsing can be
achieved using our segmenter for preprocessing. This suggests
that improvements can also be achieved in other languages and
tasks.},
url = {https://www.aclweb.org/anthology/2020.lrec-1.650},
pdf = {http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.650.pdf}
}
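The core idea of the paper above, segmenting sentences into clause-level "logical document structures", can be pictured with a toy constituency tree. A minimal sketch using NLTK (an illustration only: the tool itself retrains constituency parsers rather than traversing gold trees):

# Illustrative sketch: treat clause-level nodes (here: label "S") of a
# constituency tree as sentence-internal segments. The paper's tool retrains
# constituency parsers on modified data; this toy traversal only shows the idea.
from nltk import Tree

parse = Tree.fromstring(
    "(S (S (NP (PRP He)) (VP (VBD smiled))) (, ,) "
    "(CC and) (S (NP (PRP she)) (VP (VBD left))))"
)

def clause_segments(tree, label="S"):
    """Yield the token spans of all subtrees carrying a clause label."""
    for subtree in tree.subtrees(lambda t: t.label() == label):
        yield " ".join(subtree.leaves())

for segment in clause_segments(parse):
    print(segment)  # whole sentence, then the two embedded clauses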
2020.
The Frankfurt Latin Lexicon. From Morphological Expansion and
Word Embeddings to SemioGraphs. Studi e Saggi Linguistici, 58(1):121–155.
BibTeX
@article{Mehler:et:al:2020b,
author = {Mehler, Alexander and Jussen, Bernhard and Geelhaar, Tim and Henlein, Alexander
and Abrami, Giuseppe and Baumartz, Daniel and Uslu, Tolga and Hemati, Wahed},
title = {{The Frankfurt Latin Lexicon. From Morphological Expansion and
Word Embeddings to SemioGraphs}},
journal = {Studi e Saggi Linguistici},
doi = {10.4454/ssl.v58i1.276},
year = {2020},
volume = {58},
number = {1},
pages = {121--155},
abstract = {In this article we present the Frankfurt Latin Lexicon (FLL),
a lexical resource for Medieval Latin that is used both for the
lemmatization of Latin texts and for the post-editing of lemmatizations.
We describe recent advances in the development of lemmatizers
and test them against the Capitularies corpus (comprising Frankish
royal edicts, mid-6th to mid-9th century), a corpus created as
a reference for processing Medieval Latin. We also consider the
post-correction of lemmatizations using a limited crowdsourcing
process aimed at continuous review and updating of the FLL. Starting
from the texts resulting from this lemmatization process, we describe
the extension of the FLL by means of word embeddings, whose interactive
traversing by means of SemioGraphs completes the digital enhanced
hermeneutic circle. In this way, the article argues for a more
comprehensive understanding of lemmatization, encompassing classical
machine learning as well as intellectual post-corrections and,
in particular, human computation in the form of interpretation
processes based on graph representations of the underlying lexical
resources.},
url = {https://www.studiesaggilinguistici.it/index.php/ssl/article/view/276},
pdf = {https://www.studiesaggilinguistici.it/index.php/ssl/article/download/276/219}
}
May, 2020.
Voting for POS tagging of Latin texts: Using the flair of FLAIR
to better Ensemble Classifiers by Example of Latin. Proceedings of LT4HALA 2020 - 1st Workshop on Language Technologies
for Historical and Ancient Languages, 130–135.
BibTeX
@inproceedings{Stoeckel:et:al:2020,
author = {Stoeckel, Manuel and Henlein, Alexander and Hemati, Wahed and Mehler, Alexander},
title = {{Voting for POS tagging of Latin texts: Using the flair of FLAIR
to better Ensemble Classifiers by Example of Latin}},
booktitle = {Proceedings of LT4HALA 2020 - 1st Workshop on Language Technologies
for Historical and Ancient Languages},
month = {May},
year = {2020},
address = {Marseille, France},
publisher = {European Language Resources Association (ELRA)},
pages = {130--135},
abstract = {Despite the great importance of the Latin language in the past,
there are relatively few resources available today to develop
modern NLP tools for this language. Therefore, the EvaLatin Shared
Task for Lemmatization and Part-of-Speech (POS) tagging was published
in the LT4HALA workshop. In our work, we dealt with the second
EvaLatin task, that is, POS tagging. Since most of the available
Latin word embeddings were trained on either few or inaccurate
data, we trained several embeddings on better data in the first
step. Based on these embeddings, we trained several state-of-the-art
taggers and used them as input for an ensemble classifier called
LSTMVoter. We were able to achieve the best results for both the
cross-genre and the cross-time task (90.64\% and 87.00\%) without
using additional annotated data (closed modality). In the meantime,
we further improved the system and achieved even better results
(96.91\% on classical, 90.87\% on cross-genre and 87.35\% on cross-time).},
url = {https://www.aclweb.org/anthology/2020.lt4hala-1.21},
pdf = {http://www.lrec-conf.org/proceedings/lrec2020/workshops/LT4HALA/pdf/2020.lt4hala-1.21.pdf}
}
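As background for the ensemble idea: the simplest combination of several POS taggers is per-token hard majority voting, sketched below. LSTMVoter learns the combination instead of voting directly; the toy tagger outputs are assumptions.

# Illustrative sketch: per-token majority voting over the outputs of several
# POS taggers. LSTMVoter learns a weighted combination instead of hard voting;
# this only shows the baseline idea. The toy predictions are made up.
from collections import Counter

tagger_outputs = [
    ["NOUN", "VERB", "ADP", "NOUN"],   # tagger A
    ["NOUN", "VERB", "ADV", "NOUN"],   # tagger B
    ["PROPN", "VERB", "ADP", "NOUN"],  # tagger C
]

def majority_vote(predictions):
    """Return, per token position, the most frequent tag across taggers."""
    return [Counter(tags).most_common(1)[0][0] for tags in zip(*predictions)]

print(majority_vote(tagger_outputs))  # ['NOUN', 'VERB', 'ADP', 'NOUN']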
2020.
Fast and Easy Access to Central European Biodiversity Data with BIOfid. Biodiversity Information Science and Standards, 4:e59157.
BibTeX
@article{Driller:et:al:2020,
author = {Christine Driller and Markus Koch and Giuseppe Abrami and Wahed Hemati
and Andy Lücking and Alexander Mehler and Adrian Pachzelt and Gerwin Kasperek},
title = {Fast and Easy Access to Central European Biodiversity Data with BIOfid},
volume = {4},
year = {2020},
doi = {10.3897/biss.4.59157},
publisher = {Pensoft Publishers},
abstract = {The storage of data in public repositories such as the Global
Biodiversity Information Facility (GBIF) or the National Center
for Biotechnology Information (NCBI) is nowadays stipulated in
the policies of many publishers in order to facilitate data replication
or proliferation. Species occurrence records contained in legacy
printed literature are no exception to this. The extent of their
digital and machine-readable availability, however, is still far
from matching the existing data volume (Thessen and Parr 2014).
But precisely these data are becoming more and more relevant to
the investigation of ongoing loss of biodiversity. In order to
extract species occurrence records at a larger scale from available
publications, one has to apply specialised text mining tools.
However, such tools are in short supply especially for scientific
literature in the German language. The Specialised Information
Service Biodiversity Research BIOfid (Koch et al. 2017) aims
at reducing this desideratum, inter alia, by preparing a searchable
text corpus semantically enriched by a new kind of multi-label
annotation. For this purpose, we feed manual annotations into
automatic, machine-learning annotators. This mixture of automatic
and manual methods is needed, because BIOfid approaches a new
application area with respect to language (mainly German of the
19th century), text type (biological reports), and linguistic
focus (technical and everyday language). We will present current
results of the performance of BIOfid’s semantic search engine
and the application of independent natural language processing
(NLP) tools. Most of these are freely available online, such as
TextImager (Hemati et al. 2016). We will show how TextImager is
tied into the BIOfid pipeline and how it is made scalable (e.g.
extendible by further modules) and usable on different systems
(docker containers). Further, we will provide a short introduction
to generating machine-learning training data using TextAnnotator
(Abrami et al. 2019) for multi-label annotation. Annotation reproducibility
can be assessed by the implementation of inter-annotator agreement
methods (Abrami et al. 2020). Beyond taxon recognition and entity
linking, we place particular emphasis on location and time information.
For this purpose, our annotation tag-set combines general categories
and biology-specific categories (including taxonomic names) with
location and time ontologies. The application of the annotation
categories is regimented by annotation guidelines (Lücking et
al. 2020). Within the next years, our work deliverable will be
a semantically accessible and data-extractable text corpus of
around two million pages. In this way, BIOfid is creating a new
valuable resource that expands our knowledge of biodiversity and
its determinants.},
pages = {e59157},
url = {https://doi.org/10.3897/biss.4.59157},
eprint = {https://doi.org/10.3897/biss.4.59157},
journal = {Biodiversity Information Science and Standards},
keywords = {biofid}
}
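The abstract mentions assessing annotation reproducibility via inter-annotator agreement. As a generic illustration (not the project's actual method from Abrami et al. 2020), Cohen's kappa for two annotators can be computed as follows; the labels are made up:

# Illustrative sketch: Cohen's kappa for two annotators over categorical
# labels. BIOfid's actual agreement computation (Abrami et al. 2020) may
# differ; the label data below is a made-up example.
from collections import Counter

def cohens_kappa(a, b):
    n = len(a)
    observed = sum(x == y for x, y in zip(a, b)) / n
    freq_a, freq_b = Counter(a), Counter(b)
    expected = sum(freq_a[label] * freq_b[label] for label in freq_a) / (n * n)
    return (observed - expected) / (1 - expected)

ann1 = ["Taxon", "Location", "Taxon", "Time", "Taxon"]
ann2 = ["Taxon", "Location", "Other", "Time", "Taxon"]
print(round(cohens_kappa(ann1, ann2), 3))  # ~0.706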
2020.
Multiple Texts as a Limiting Factor in Online Learning: Quantifying
(Dis-)similarities of Knowledge Networks. Frontiers in Education, 5:206.
BibTeX
@article{Mehler:Hemati:Welke:Konca:Uslu:2020,
abstract = {We test the hypothesis that the extent to which one obtains information
on a given topic through Wikipedia depends on the language in
which it is consulted. Controlling the size factor, we investigate
this hypothesis for 25 subject areas. Since Wikipedia
is a central part of the web-based information landscape, this
indicates a language-related, linguistic bias. The article therefore
deals with the question of whether Wikipedia exhibits this kind
of linguistic relativity or not. From the perspective of educational
science, the article develops a computational model of the information
landscape from which multiple texts are drawn as typical input
of web-based reading. For this purpose, it develops a hybrid model
of intra- and intertextual similarity of different parts of the
information landscape and tests this model on the example of 35
languages and corresponding Wikipedias. In the way it measures
the similarities of hypertexts, the article goes beyond existing
approaches by examining their structural and semantic aspects
intra- and intertextually. In this way it builds a bridge between
reading research, educational science, Wikipedia research and
computational linguistics.},
author = {Mehler, Alexander and Hemati, Wahed and Welke, Pascal and Konca, Maxim
and Uslu, Tolga},
doi = {10.3389/feduc.2020.562670},
issn = {2504-284X},
journal = {Frontiers in Education},
pages = {206},
title = {Multiple Texts as a Limiting Factor in Online Learning: Quantifying
(Dis-)similarities of Knowledge Networks},
url = {https://www.frontiersin.org/article/10.3389/feduc.2020.562670},
pdf = {https://www.frontiersin.org/articles/10.3389/feduc.2020.562670/pdf},
volume = {5},
year = {2020}
}
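For a rough intuition of measuring (dis-)similarities between texts on the same topic, here is a minimal sketch using TF-IDF vectors and cosine similarity. The paper's hybrid intra-/intertextual model over 35 Wikipedias is far richer; the two example sentences are assumptions.

# Illustrative sketch: cosine similarity between TF-IDF vectors as the
# simplest stand-in for the paper's hybrid similarity model of Wikipedia
# knowledge networks. The example texts are made up.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

doc_a = "gravity is a force that attracts two bodies with mass"
doc_b = "gravity is the force that pulls objects with mass together"

matrix = TfidfVectorizer().fit_transform([doc_a, doc_b])
print(cosine_similarity(matrix[0], matrix[1])[0, 0])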
2020.
PhD Thesis: TextImager-VSD: large scale verb sense disambiguation and named
entity recognition in the context of TextImager.
BibTeX
@phdthesis{Hemati:2020,
author = {Wahed Hemati},
title = {TextImager-VSD: large scale verb sense disambiguation and named
entity recognition in the context of TextImager},
school = {Goethe University Frankfurt},
pages = {174},
year = {2020},
url = {http://publikationen.ub.uni-frankfurt.de/frontdoor/index/index/docId/56089},
pdf = {http://publikationen.ub.uni-frankfurt.de/files/56089/dissertation_Wahed_Hemati.pdf}
}
2019
2019.
A practitioner's view: a survey and comparison of lemmatization
and morphological tagging in German and Latin. Journal of Language Modelling, 7(1).
BibTeX
@article{Gleim:Eger:Mehler:2019,
author = {Gleim, R\"{u}diger and Eger, Steffen and Mehler, Alexander and Uslu, Tolga
and Hemati, Wahed and L\"{u}cking, Andy and Henlein, Alexander and Kahlsdorf, Sven
and Hoenen, Armin},
title = {A practitioner's view: a survey and comparison of lemmatization
and morphological tagging in German and Latin},
journal = {Journal of Language Modelling},
volume = {7},
number = {1},
year = {2019},
pdf = {https://www.texttechnologylab.org/wp-content/uploads/2019/07/jlm-tagging.pdf},
doi = {10.15398/jlm.v7i1.205},
url = {http://jlm.ipipan.waw.pl/index.php/JLM/article/view/205}
}
2019.
Der TextImager als Front- und Backend für das verteilte NLP von
Big Digital Humanities Data. Proceedings of the 6th Digital Humanities Conference in the German-speaking
Countries, DHd 2019.
In German. Title translates into: The TextImager as front end and back end
for the distributed NLP of big digital humanities data.
BibTeX
@inproceedings{Hemati:Mehler:Uslu:Abrami:2019,
author = {Hemati, Wahed and Mehler, Alexander and Uslu, Tolga and Abrami, Giuseppe},
title = {{Der TextImager als Front- und Backend für das verteilte NLP von
Big Digital Humanities Data}},
booktitle = {Proceedings of the 6th Digital Humanities Conference in the German-speaking
Countries, DHd 2019},
series = {DHd 2019},
pdf = {https://www.texttechnologylab.org/wp-content/uploads/2019/04/Der-TextImager-als-Fron-und-Backend.pdf},
poster = {https://www.texttechnologylab.org/wp-content/uploads/2019/04/DHD19_TextImager.pdf},
location = {Frankfurt, Germany},
note = {In German. Title translates into: The TextImager as front end and back end
for the distributed NLP of big digital humanities data},
year = {2019}
}
January, 2019.
LSTMVoter: chemical named entity recognition using a conglomerate
of sequence labeling tools. Journal of Cheminformatics, 11(1):7.
BibTeX
@article{Hemati:Mehler:2019a,
abstract = {Chemical and biomedical named entity recognition (NER) is an essential
preprocessing task in natural language processing. The identification
and extraction of named entities from scientific articles is also
attracting increasing interest in many scientific disciplines.
Locating chemical named entities in the literature is an essential
step in chemical text mining pipelines for identifying chemical
mentions, their properties, and relations as discussed in the
literature. In this work, we describe an approach to the BioCreative
V.5 challenge regarding the recognition and classification of
chemical named entities. For this purpose, we transform the task
of NER into a sequence labeling problem. We present a series of
sequence labeling systems that we used, adapted and optimized
in our experiments for solving this task. To this end, we experiment
with hyperparameter optimization. Finally, we present LSTMVoter,
a two-stage application of recurrent neural networks that integrates
the optimized sequence labelers from our study into a single ensemble
classifier.},
author = {Hemati, Wahed and Mehler, Alexander},
day = {10},
doi = {10.1186/s13321-018-0327-2},
issn = {1758-2946},
journal = {Journal of Cheminformatics},
month = {Jan},
number = {1},
pages = {7},
title = {{{LSTMVoter}: chemical named entity recognition using a conglomerate
of sequence labeling tools}},
url = {https://doi.org/10.1186/s13321-018-0327-2},
volume = {11},
year = {2019}
}
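The two-stage idea behind LSTMVoter, base sequence labelers whose per-token outputs feed a second-stage learner, can be sketched generically. The paper's second stage is a recurrent network; a logistic regression stands in for it here, and all tags below are made-up examples.

# Illustrative sketch: a second-stage classifier stacked on the per-token
# outputs of several first-stage NER taggers. LSTMVoter uses a recurrent
# network for this stage; logistic regression here only shows the stacking
# idea. All tag data below is made up.
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Per token: the tags proposed by three base taggers, plus the gold tag.
features = [
    {"t1": "B-CHEM", "t2": "B-CHEM", "t3": "O"},
    {"t1": "O", "t2": "O", "t3": "O"},
    {"t1": "B-CHEM", "t2": "O", "t3": "B-CHEM"},
    {"t1": "O", "t2": "B-CHEM", "t3": "O"},
]
gold = ["B-CHEM", "O", "B-CHEM", "O"]

voter = make_pipeline(DictVectorizer(), LogisticRegression(max_iter=1000))
voter.fit(features, gold)
print(voter.predict([{"t1": "B-CHEM", "t2": "B-CHEM", "t3": "B-CHEM"}]))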
March, 2019.
CRFVoter: gene and protein related object recognition using
a conglomerate of CRF-based tools. Journal of Cheminformatics, 11(1):11.
BibTeX
@article{Hemati:Mehler:2019b,
author = {Hemati, Wahed and Mehler, Alexander},
title = {{{CRFVoter}: gene and protein related object recognition using
a conglomerate of CRF-based tools}},
journal = {Journal of Cheminformatics},
year = {2019},
month = {Mar},
day = {14},
volume = {11},
number = {1},
pages = {11},
abstract = {Gene and protein related objects are an important class of entities
in biomedical research, whose identification and extraction from
scientific articles is attracting increasing interest. In this
work, we describe an approach to the BioCreative V.5 challenge
regarding the recognition and classification of gene and protein
related objects. For this purpose, we transform the task as posed
by BioCreative V.5 into a sequence labeling problem. We present
a series of sequence labeling systems that we used and adapted
in our experiments for solving this task. Our experiments show
how to optimize the hyperparameters of the classifiers involved.
To this end, we utilize various algorithms for hyperparameter
optimization. Finally, we present CRFVoter, a two-stage application
of Conditional Random Field (CRF) that integrates the optimized
sequence labelers from our study into one ensemble classifier.},
issn = {1758-2946},
doi = {10.1186/s13321-019-0343-x},
url = {https://doi.org/10.1186/s13321-019-0343-x}
}
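The abstract emphasizes hyperparameter optimization of the classifiers involved. As a generic illustration (the paper uses dedicated optimization algorithms), a random search over two CRF regularization weights might look as follows; train_and_score is a hypothetical placeholder:

# Illustrative sketch: random search over CRF hyperparameters. The paper
# employs dedicated optimization algorithms; train_and_score() is a
# hypothetical stand-in for training a sequence labeler and returning its F1.
import random

search_space = {"c1": (0.0, 1.0), "c2": (0.0, 1.0)}  # L1/L2 regularization

def sample(space):
    return {name: random.uniform(lo, hi) for name, (lo, hi) in space.items()}

def train_and_score(params):
    # Hypothetical placeholder: train a CRF with `params`, return dev-set F1.
    return random.random()

candidates = [sample(search_space) for _ in range(20)]
best = max(candidates, key=train_and_score)
print(best)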
2019.
Corpus2Wiki: A MediaWiki-based Tool for Automatically Generating
Wikiditions in Digital Humanities. INF-DH-2019.
BibTeX
@inproceedings{Hunziker:et:al:2019,
author = {Hunziker, Alex and Mammadov, Hasanagha and Hemati, Wahed and Mehler, Alexander},
title = {{Corpus2Wiki}: A MediaWiki-based Tool for Automatically Generating
Wikiditions in Digital Humanities},
booktitle = {INF-DH-2019},
year = {2019},
editor = {Burghardt, Manuel AND Müller-Birn, Claudia},
publisher = {Gesellschaft für Informatik e.V.},
address = {Bonn}
}
November, 2019.
When Specialization Helps: Using Pooled Contextualized Embeddings
to Detect Chemical and Biomedical Entities in Spanish. Proceedings of The 5th Workshop on BioNLP Open Shared Tasks, 11–15.
BibTeX
@inproceedings{Stoeckel:Hemati:Mehler:2019,
title = {When Specialization Helps: Using Pooled Contextualized Embeddings
to Detect Chemical and Biomedical Entities in {S}panish},
author = {Stoeckel, Manuel and Hemati, Wahed and Mehler, Alexander},
booktitle = {Proceedings of The 5th Workshop on BioNLP Open Shared Tasks},
month = {nov},
year = {2019},
address = {Hong Kong, China},
publisher = {Association for Computational Linguistics},
url = {https://www.aclweb.org/anthology/D19-5702},
doi = {10.18653/v1/D19-5702},
pages = {11--15},
abstract = {The recognition of pharmacological substances, compounds and proteins
is an essential preliminary work for the recognition of relations
between chemicals and other biomedically relevant units. In this
paper, we describe an approach to Task 1 of the PharmaCoNER Challenge,
which involves the recognition of mentions of chemicals and drugs
in Spanish medical texts. We train a state-of-the-art BiLSTM-CRF
sequence tagger with stacked Pooled Contextualized Embeddings,
word and sub-word embeddings using the open-source framework FLAIR.
We present a new corpus composed of articles and papers from Spanish
health science journals, termed the Spanish Health Corpus, and
use it to train domain-specific embeddings which we incorporate
in our model training. We achieve a result of 89.76{\%} F1-score
using pre-trained embeddings and are able to improve these results
to 90.52{\%} F1-score using specialized embeddings.}
}
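Since the abstract names the FLAIR framework explicitly, here is a compact sketch of training a BiLSTM-CRF tagger with stacked (pooled) contextualized embeddings in the 0.4-era FLAIR API. The corpus path, column format, and hyperparameters are assumptions, not the paper's exact configuration.

# Illustrative sketch of training a BiLSTM-CRF tagger with stacked embeddings
# in FLAIR, close to the setup the abstract describes. The path, column layout
# and hyperparameters are assumptions; the API shown is the 0.4-era one.
from flair.datasets import ColumnCorpus
from flair.embeddings import PooledFlairEmbeddings, StackedEmbeddings, WordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

corpus = ColumnCorpus("data/pharmaconer", {0: "text", 1: "ner"})  # hypothetical path
embeddings = StackedEmbeddings([
    WordEmbeddings("es"),                     # Spanish fastText word embeddings
    PooledFlairEmbeddings("spanish-forward"),
    PooledFlairEmbeddings("spanish-backward"),
])
tagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=corpus.make_tag_dictionary(tag_type="ner"),
    tag_type="ner",
    use_crf=True,
)
ModelTrainer(tagger, corpus).train("models/pharmaconer", max_epochs=100)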
2018
March, 2018.
Natural Language Processing and Text Mining for BIOfid.
BibTeX
@misc{Abrami:et:al:2018b,
author = {Abrami, Giuseppe and Ahmed, Sajawel and Gleim, R{\"u}diger and Hemati, Wahed
and Mehler, Alexander and Uslu, Tolga},
title = {{Natural Language Processing and Text Mining for BIOfid}},
howpublished = {Presentation at the 1st Meeting of the Scientific Advisory Board of the BIOfid Project},
address = {Goethe-University, Frankfurt am Main, Germany},
year = {2018},
month = {March},
day = {08},
}
2018.
Integrating Computational Linguistic Analysis of Multilingual
Learning Data and Educational Measurement Approaches to Explore
Learning in Higher Education. In: Positive Learning in the Age of Information: A Blessing or a Curse?, 145–193.
Springer Fachmedien Wiesbaden.
BibTeX
@inbook{Mehler:et:al:2018,
abstract = {This chapter develops a computational linguistic model for analyzing
and comparing multilingual data as well as its application to
a large body of standardized assessment data from higher education.
The approach employs both an automatic and a manual annotation
of the data on several linguistic layers (including parts of speech,
text structure and content). Quantitative features of the textual
data are explored that are related to both the students' (domain-specific
knowledge) test results and their level of academic experience.
The respective analysis involves statistics of distance correlation,
text categorization with respect to text types (questions and
response options) as well as languages (English and German), and
network analysis to assess dependencies between features. The
correlation between correct test results of students and linguistic
features of the verbal presentations of tests indicate to what
extent language influences higher education test performance.
It has also been found that this influence relates to specialized
language. Thus, this integrative modeling approach contributes
a test basis for a large-scale analysis of learning data and points
to a number of subsequent, more detailed research questions.},
address = {Wiesbaden},
author = {Mehler, Alexander and Zlatkin-Troitschanskaia, Olga and Hemati, Wahed
and Molerov, Dimitri and L{\"u}cking, Andy and Schmidt, Susanne},
booktitle = {Positive Learning in the Age of Information: A Blessing or a Curse?},
doi = {10.1007/978-3-658-19567-0_10},
editor = {Zlatkin-Troitschanskaia, Olga and Wittum, Gabriel and Dengel, Andreas},
isbn = {978-3-658-19567-0},
pages = {145--193},
publisher = {Springer Fachmedien Wiesbaden},
title = {Integrating Computational Linguistic Analysis of Multilingual
Learning Data and Educational Measurement Approaches to Explore
Learning in Higher Education},
url = {https://doi.org/10.1007/978-3-658-19567-0_10},
year = {2018}
}
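One of the statistics named in the abstract, distance correlation, captures nonlinear dependence between two samples. A minimal NumPy implementation of the sample statistic (the data below is synthetic):

# Illustrative sketch: sample distance correlation, one of the statistics
# named in the abstract. The input data is synthetic.
import numpy as np

def distance_correlation(x, y):
    """Sample distance correlation of two 1-D samples of equal length."""
    x, y = np.asarray(x, float), np.asarray(y, float)
    a = np.abs(x[:, None] - x[None, :])                  # pairwise distances
    b = np.abs(y[:, None] - y[None, :])
    A = a - a.mean(0) - a.mean(1)[:, None] + a.mean()    # double centering
    B = b - b.mean(0) - b.mean(1)[:, None] + b.mean()
    dcov2 = (A * B).mean()
    return np.sqrt(dcov2 / np.sqrt((A * A).mean() * (B * B).mean()))

rng = np.random.default_rng(0)
x = rng.normal(size=200)
print(distance_correlation(x, x ** 2))                # detects nonlinear dependence
print(distance_correlation(x, rng.normal(size=200)))  # near zero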
2018.
fastSense: An Efficient Word Sense Disambiguation Classifier. Proceedings of the 11th edition of the Language Resources and
Evaluation Conference, May 7 - 12.
BibTeX
@inproceedings{Uslu:et:al:2018,
author = {Tolga Uslu and Alexander Mehler and Daniel Baumartz and Alexander Henlein
and Wahed Hemati},
title = {fastSense: An Efficient Word Sense Disambiguation Classifier},
booktitle = {Proceedings of the 11th edition of the Language Resources and
Evaluation Conference, May 7 - 12},
series = {LREC 2018},
address = {Miyazaki, Japan},
pdf = {https://www.texttechnologylab.org/wp-content/uploads/2018/03/fastSense.pdf},
year = {2018}
}
2018.
Automatic Classification in Memory Clinic Patients and in Depressive Patients. Proceedings of Resources and ProcessIng of linguistic, para-linguistic
and extra-linguistic Data from people with various forms of cognitive/psychiatric
impairments (RaPID-2).
BibTeX
@inproceedings{Uslu:et:al:2018:a,
author = {Tolga Uslu and Lisa Miebach and Steffen Wolfsgruber and Michael Wagner
and Klaus Fließbach and Rüdiger Gleim and Wahed Hemati and Alexander Henlein
and Alexander Mehler},
title = {{Automatic Classification in Memory Clinic Patients and in Depressive Patients}},
booktitle = {Proceedings of Resources and ProcessIng of linguistic, para-linguistic
and extra-linguistic Data from people with various forms of cognitive/psychiatric
impairments (RaPID-2)},
series = {RaPID},
location = {Miyazaki, Japan},
year = {2018}
}
2018.
A Multidimensional Model of Syntactic Dependency Trees for Authorship
Attribution. Quantitative analysis of dependency structures.
BibTeX
@incollection{Mehler:Hemati:Uslu:Luecking:2018,
author = {Alexander Mehler and Wahed Hemati and Tolga Uslu and Andy Lücking},
title = {A Multidimensional Model of Syntactic Dependency Trees for Authorship
Attribution},
booktitle = {Quantitative analysis of dependency structures},
publisher = {De Gruyter},
editor = {Jingyang Jiang and Haitao Liu},
address = {Berlin/New York},
abstract = {In this chapter we introduce a multidimensional model
of syntactic dependency trees. Our ultimate goal is to generate
fingerprints of such trees to predict the author of the underlying
sentences. The chapter makes a first attempt to create such fingerprints
for sentence categorization via the detour of text categorization.
We show that at text level, aggregated dependency structures actually
provide information about authorship. At the same time, we show
that this does not hold for topic detection. We evaluate our model
using a quarter of a million sentences collected in two corpora:
the first is sampled from literary texts, the second from Wikipedia
articles. As a second finding of our approach, we show that quantitative
models of dependency structure do not yet allow for detecting
syntactic alignment in written communication. We conclude that
this is mainly due to effects of lexical alignment on syntactic
alignment.},
keywords = {Dependency structure, Authorship attribution, Text
categorization, Syntactic Alignment},
year = {2018}
}
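Two of the simplest quantitative dimensions of a dependency tree, depth and mean dependency distance, can be computed directly from CoNLL-style head indices, as the following sketch shows. The chapter's fingerprints aggregate many more dimensions; the example sentence is an assumption.

# Illustrative sketch: two simple quantitative features of a dependency tree
# given as CoNLL-style head indices (0 = root). The chapter's fingerprints
# aggregate many more such dimensions; this only shows the representation.
def tree_depth(heads):
    """Longest path from any token up to the root."""
    def depth(i):
        return 0 if heads[i] == 0 else 1 + depth(heads[i] - 1)
    return max(depth(i) for i in range(len(heads)))

def mean_dependency_distance(heads):
    """Average linear distance between a token and its head (root excluded)."""
    dists = [abs((i + 1) - h) for i, h in enumerate(heads) if h != 0]
    return sum(dists) / len(dists)

# "She quickly read the old book": heads are 1-based token indices.
heads = [3, 3, 0, 6, 6, 3]
print(tree_depth(heads), mean_dependency_distance(heads))  # 2 1.8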
2018.
VienNA: Auf dem Weg zu einer Infrastruktur für die verteilte
interaktive evolutionäre Verarbeitung natürlicher Sprache. Forschungsinfrastrukturen und digitale Informationssysteme in
der germanistischen Sprachwissenschaft, 6.
In German. Title translates into: VienNA: Towards an infrastructure for the
distributed, interactive, evolutionary processing of natural language.
BibTeX
@incollection{Mehler:Hemati:Gleim:Baumartz:2018,
author = {Alexander Mehler and Wahed Hemati and Rüdiger Gleim and Daniel Baumartz},
title = {{VienNA}: Auf dem Weg zu einer Infrastruktur für die verteilte
interaktive evolutionäre Verarbeitung natürlicher Sprache},
booktitle = {Forschungsinfrastrukturen und digitale Informationssysteme in
der germanistischen Sprachwissenschaft},
publisher = {De Gruyter},
editor = {Henning Lobin and Roman Schneider and Andreas Witt},
volume = {6},
address = {Berlin},
note = {In German. Title translates into: VienNA: Towards an infrastructure for the
distributed, interactive, evolutionary processing of natural language},
year = {2018}
}
2018.
Evaluating and Integrating Databases in the Area of NLP. International Quantitative Linguistics Conference (QUALICO 2018).
BibTeX
@inproceedings{Hemati:Mehler:Uslu:Baumartz:Abrami:2018,
author = {Wahed Hemati and Alexander Mehler and Tolga Uslu and Daniel Baumartz
and Giuseppe Abrami},
title = {Evaluating and Integrating Databases in the Area of {NLP}},
booktitle = {International Quantitative Linguistics Conference (QUALICO 2018)},
year = {2018},
pdf = {https://www.texttechnologylab.org/wp-content/uploads/2018/04/Hemat-Mehler-Uslu-Baumartz-Abrami-Qualico-2018.pdf},
poster = {https://www.texttechnologylab.org/wp-content/uploads/2018/10/qualico2018_databases_poster_hemati_mehler_uslu_baumartz_abrami.pdf},
location = {Wroclaw, Poland}
}
2018.
Workflow and Current Achievements of BIOfid, an Information Service
Mobilizing Biodiversity Data from Literature Sources. Biodiversity Information Science and Standards, 2:e25876.
BibTeX
@article{Driller:et:al:2018,
author = {Christine Driller and Markus Koch and Marco Schmidt and Claus Weiland
and Thomas Hörnschemeyer and Thomas Hickler and Giuseppe Abrami and Sajawel Ahmed
and Rüdiger Gleim and Wahed Hemati and Tolga Uslu and Alexander Mehler
and Adrian Pachzelt and Jashar Rexhepi and Thomas Risse and Janina Schuster
and Gerwin Kasperek and Angela Hausinger},
title = {Workflow and Current Achievements of BIOfid, an Information Service
Mobilizing Biodiversity Data from Literature Sources},
volume = {2},
year = {2018},
doi = {10.3897/biss.2.25876},
publisher = {Pensoft Publishers},
abstract = {BIOfid is a specialized information service currently being developed
to mobilize biodiversity data dormant in printed historical and
modern literature and to offer a platform for open access journals
on the science of biodiversity. Our team of librarians, computer
scientists and biologists produce high-quality text digitizations,
develop new text-mining tools and generate detailed ontologies
enabling semantic text analysis and semantic search by means of
user-specific queries. In a pilot project we focus on German publications
on the distribution and ecology of vascular plants, birds, moths
and butterflies extending back to the Linnaeus period about 250
years ago. The three organism groups have been selected according
to current demands of the relevant research community in Germany.
The text corpus defined for this purpose comprises over 400 volumes
with more than 100,000 pages to be digitized and will be complemented
by journals from other digitization projects, copyright-free and
project-related literature. With TextImager (Natural Language
Processing & Text Visualization) and TextAnnotator (Discourse
Semantic Annotation) we have already extended and launched tools
that focus on the text-analytical section of our project. Furthermore,
taxonomic and anatomical ontologies elaborated by us for the taxa
prioritized by the project’s target group - German institutions
and scientists active in biodiversity research - are constantly
improved and expanded to maximize scientific data output. Our
poster describes the general workflow of our project ranging from
literature acquisition via software development, to data availability
on the BIOfid web portal (http://biofid.de/), and the implementation
into existing platforms which serve to promote global accessibility
of biodiversity data.},
pages = {e25876},
url = {https://doi.org/10.3897/biss.2.25876},
eprint = {https://doi.org/10.3897/biss.2.25876},
journal = {Biodiversity Information Science and Standards},
keywords = {biofid}
}
2018.
Corpus2Wiki: A MediaWiki based Annotation & Visualisation Tool
for the Digital Humanities. INF-DH-2018.
BibTeX
@inproceedings{Rutherford:et:al:2018,
author = {Rutherford, Eleanor AND Hemati, Wahed AND Mehler, Alexander},
title = {{Corpus2Wiki}: A MediaWiki based Annotation \& Visualisation Tool
for the Digital Humanities},
booktitle = {INF-DH-2018},
year = {2018},
editor = {Burghardt, Manuel AND Müller-Birn, Claudia},
publisher = {Gesellschaft für Informatik e.V.},
address = {Bonn}
}
2017
2017.
TextImager as a Generic Interface to R. Software Demonstrations of the 15th Conference of the European
Chapter of the Association for Computational Linguistics (EACL
2017).
BibTeX
@inproceedings{Uslu:Hemati:Mehler:Baumartz:2017,
author = {Tolga Uslu and Wahed Hemati and Alexander Mehler and Daniel Baumartz},
title = {{TextImager} as a Generic Interface to {R}},
booktitle = {Software Demonstrations of the 15th Conference of the European
Chapter of the Association for Computational Linguistics (EACL
2017)},
location = {Valencia, Spain},
pdf = {https://www.texttechnologylab.org/wp-content/uploads/2018/03/TextImager.pdf},
year = {2017}
}
2017.
Skalenfreie online soziale Lexika am Beispiel von Wiktionary. Proceedings of 53rd Annual Conference of the Institut für Deutsche
Sprache (IDS), March 14-16, Mannheim, Germany.
In German. Title translates into: Scale-free
online social lexica by the example of Wiktionary.
BibTeX
@inproceedings{Mehler:Gleim:Hemati:Uslu:2017,
author = {Alexander Mehler and Rüdiger Gleim and Wahed Hemati and Tolga Uslu},
title = {{Skalenfreie online soziale Lexika am Beispiel von Wiktionary}},
booktitle = {Proceedings of 53rd Annual Conference of the Institut für Deutsche
Sprache (IDS), March 14-16, Mannheim, Germany},
editor = {Stefan Engelberg and Henning Lobin and Kathrin Steyer and Sascha Wolfer},
address = {Berlin},
publisher = {De Gruyter},
note = {In German. Title translates into: Scale-free
online social lexica by the example of Wiktionary},
abstract = {In English: The paper deals with characteristics of the structural,
thematic and participatory dynamics of collaboratively generated
lexical networks. This is done by example of Wiktionary. Starting
from a network-theoretical model in terms of so-called multi-layer
networks, we describe Wiktionary as a scale-free lexicon. Systems
of this sort are characterized by the fact that their content-related
dynamics is determined by the underlying dynamics of collaborating
authors. This happens in a way that social structure imprints
on content structure. According to this conception, the unequal
distribution of the activities of authors results in a correspondingly
unequal distribution of the information units documented within
the lexicon. The paper focuses on foundations for describing such
systems starting from a parameter space which requires to deal
with Wiktionary as an issue in big data analysis. In German: Der
Beitrag thematisiert Eigenschaften der strukturellen, thematischen
und partizipativen Dynamik kollaborativ erzeugter lexikalischer
Netzwerke am Beispiel von Wiktionary. Ausgehend von einem netzwerktheoretischen
Modell in Form so genannter Mehrebenennetzwerke wird Wiktionary
als ein skalenfreies Lexikon beschrieben. Systeme dieser Art zeichnen
sich dadurch aus, dass ihre inhaltliche Dynamik durch die zugrundeliegende
Kollaborationsdynamik bestimmt wird, und zwar so, dass sich die
soziale Struktur der entsprechenden inhaltlichen Struktur aufprägt.
Dieser Auffassung gemäß führt die Ungleichverteilung der Aktivitäten
von Lexikonproduzenten zu einer analogen Ungleichverteilung der
im Lexikon dokumentierten Informationseinheiten. Der Beitrag thematisiert
Grundlagen zur Beschreibung solcher Systeme ausgehend von einem
Parameterraum, welcher die netzwerkanalytische Betrachtung von
Wiktionary als Big-Data-Problem darstellt.},
year = {2017}
}
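The scale-free claim can be illustrated by fitting a Zipf-like power law to a rank-ordered activity distribution in log-log space. The edit counts below are invented; a real analysis would use complete revision histories:

# Illustrative sketch: checking for a heavy-tailed, Zipf-like distribution of
# author activity by fitting a line in log-log rank/frequency space. The edit
# counts are made up.
import numpy as np

edit_counts = np.array(sorted([5000, 1200, 600, 300, 150, 80, 40, 20, 10, 5], reverse=True))
ranks = np.arange(1, len(edit_counts) + 1)

slope, _ = np.polyfit(np.log(ranks), np.log(edit_counts), 1)
print(f"estimated Zipf exponent: {-slope:.2f}")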
2017.
TextImager as an interface to BeCalm. BioCreative V.5. Proceedings.
BibTeX
@inproceedings{Hemati:Uslu:Mehler:2017,
author = {Wahed Hemati and Tolga Uslu and Alexander Mehler},
title = {{TextImager} as an interface to {BeCalm}},
booktitle = {BioCreative V.5. Proceedings},
pdf = {https://www.texttechnologylab.org/wp-content/uploads/2018/03/TextImager_BeCalm.pdf},
year = {2017}
}
2017.
CRFVoter: Chemical Entity Mention, Gene and Protein Related
Object recognition using a conglomerate of CRF based tools. BioCreative V.5. Proceedings.
BibTeX
@inproceedings{Hemati:Mehler:Uslu:2017,
author = {Wahed Hemati and Alexander Mehler and Tolga Uslu},
title = {{CRFVoter}: Chemical Entity Mention, Gene and Protein Related
Object recognition using a conglomerate of CRF based tools},
booktitle = {BioCreative V.5. Proceedings},
pdf = {https://www.texttechnologylab.org/wp-content/uploads/2018/03/CRFVoter.pdf},
year = {2017}
}
2016
2016.
Wikidition: Automatic Lexiconization and Linkification of Text Corpora. Information Technology, 58:70–79.
BibTeX
@article{Mehler:et:al:2016,
author = {Alexander Mehler and Rüdiger Gleim and Tim vor der Brück and Wahed Hemati
and Tolga Uslu and Steffen Eger},
title = {Wikidition: Automatic Lexiconization and Linkification of Text Corpora},
journal = {Information Technology},
volume = {58},
pages = {70--79},
abstract = {We introduce a new text technology, called Wikidition, which automatically
generates large scale editions of corpora of natural language
texts. Wikidition combines a wide range of text mining tools for
automatically linking lexical, sentential and textual units. This
includes the extraction of corpus-specific lexica down to the
level of syntactic words and their grammatical categories. To
this end, we introduce a novel measure of text reuse and exemplify
Wikidition by means of the capitularies, that is, a corpus of
Medieval Latin texts.},
doi = {10.1515/itit-2015-0035},
year = {2016}
}
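As a generic illustration of text reuse measurement (explicitly not the novel measure introduced in the paper), character n-gram Jaccard overlap between two passages can be computed as follows:

# Illustrative sketch: character n-gram Jaccard overlap as the simplest kind
# of text reuse measure. This is NOT the novel measure introduced in the
# paper; it only illustrates the task on a made-up Latin example.
def ngrams(text, n=5):
    return {text[i:i + n] for i in range(len(text) - n + 1)}

def jaccard_reuse(a, b, n=5):
    ga, gb = ngrams(a, n), ngrams(b, n)
    return len(ga & gb) / len(ga | gb)

a = "ut nullus liber homo sine iudicio parium suorum capiatur"
b = "nullus homo liber capiatur sine legali iudicio parium suorum"
print(round(jaccard_reuse(a, b), 3))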
2016.
Text2voronoi: An Image-driven Approach to Differential Diagnosis. Proceedings of the 5th Workshop on Vision and Language (VL'16)
hosted by the 54th Annual Meeting of the Association for Computational
Linguistics (ACL), Berlin.
BibTeX
@inproceedings{Mehler:Uslu:Hemati:2016,
author = {Alexander Mehler and Tolga Uslu and Wahed Hemati},
title = {Text2voronoi: An Image-driven Approach to Differential Diagnosis},
booktitle = {Proceedings of the 5th Workshop on Vision and Language (VL'16)
hosted by the 54th Annual Meeting of the Association for Computational
Linguistics (ACL), Berlin},
pdf = {https://aclweb.org/anthology/W/W16/W16-3212.pdf},
year = {2016}
}
2016.
TextImager: a Distributed UIMA-based System for NLP. Proceedings of the COLING 2016 System Demonstrations.
BibTeX
@inproceedings{Hemati:Uslu:Mehler:2016,
author = {Wahed Hemati and Tolga Uslu and Alexander Mehler},
title = {TextImager: a Distributed UIMA-based System for NLP},
booktitle = {Proceedings of the COLING 2016 System Demonstrations},
location = {Osaka, Japan},
pdf = {https://www.texttechnologylab.org/wp-content/uploads/2018/03/TextImager2016.pdf},
year = {2016}
}