
Dr. Wahed Hemati
M.Sc. Computer Science
External habilitation candidate
Total: 30
2021 (1)
-
M. Konca, A. Mehler, D. Baumartz, and W. Hemati, “From distinguishability to informativity. A quantitative text model for detecting random texts,” Language and Text: Data, models, information and applications, vol. 356, pp. 145-162, 2021.
[BibTeX]@article{Konca:et:al:2021, title={From distinguishability to informativity. A quantitative text model for detecting random texts.}, author={Konca, Maxim and Mehler, Alexander and Baumartz, Daniel and Hemati, Wahed}, journal={Language and Text: Data, models, information and applications}, volume={356}, pages={145--162}, year={2021}, editor={Adam Paw{\l}owski and Jan Ma{\v{c}}utek and Sheila Embleton and George Mikros}, publisher={John Benjamins Publishing Company}, doi={10.1075/cilt.356.10kon} }
2020 (7)
-
W. Hemati, “TextImager-VSD: large scale verb sense disambiguation and named entity recognition in the context of TextImager,” PhD Thesis, 2020.
[BibTeX]@phdthesis{Hemati:2020, author = {Wahed Hemati}, title = {TextImager-VSD : large scale verb sense disambiguation and named entity recognition in the context of TextImager}, pages = {174}, year = {2020}, url = {http://publikationen.ub.uni-frankfurt.de/frontdoor/index/index/docId/56089}, pdf = {http://publikationen.ub.uni-frankfurt.de/files/56089/dissertation_Wahed_Hemati.pdf} }
-
A. Mehler, W. Hemati, P. Welke, M. Konca, and T. Uslu, “Multiple Texts as a Limiting Factor in Online Learning: Quantifying (Dis-)similarities of Knowledge Networks,” Frontiers in Education, vol. 5, p. 206, 2020.
[Abstract] [BibTeX]We test the hypothesis that the extent to which one obtains information on a given topic through Wikipedia depends on the language in which it is consulted. Controlling the size factor, we investigate this hypothesis for a number of 25 subject areas. Since Wikipedia is a central part of the web-based information landscape, this indicates a language-related, linguistic bias. The article therefore deals with the question of whether Wikipedia exhibits this kind of linguistic relativity or not. From the perspective of educational science, the article develops a computational model of the information landscape from which multiple texts are drawn as typical input of web-based reading. For this purpose, it develops a hybrid model of intra- and intertextual similarity of different parts of the information landscape and tests this model on the example of 35 languages and corresponding Wikipedias. In the way it measures the similarities of hypertexts, the article goes beyond existing approaches by examining their structural and semantic aspects intra- and intertextually. In this way it builds a bridge between reading research, educational science, Wikipedia research and computational linguistics.
@article{Mehler:Hemati:Welke:Konca:Uslu:2020, abstract = {We test the hypothesis that the extent to which one obtains information on a given topic through Wikipedia depends on the language in which it is consulted. Controlling the size factor, we investigate this hypothesis for a number of 25 subject areas. Since Wikipedia is a central part of the web-based information landscape, this indicates a language-related, linguistic bias. The article therefore deals with the question of whether Wikipedia exhibits this kind of linguistic relativity or not. From the perspective of educational science, the article develops a computational model of the information landscape from which multiple texts are drawn as typical input of web-based reading. For this purpose, it develops a hybrid model of intra- and intertextual similarity of different parts of the information landscape and tests this model on the example of 35 languages and corresponding Wikipedias. In the way it measures the similarities of hypertexts, the article goes beyond existing approaches by examining their structural and semantic aspects intra- and intertextually. In this way it builds a bridge between reading research, educational science, Wikipedia research and computational linguistics.}, author = {Mehler, Alexander and Hemati, Wahed and Welke, Pascal and Konca, Maxim and Uslu, Tolga}, doi = {10.3389/feduc.2020.562670}, issn = {2504-284X}, journal = {Frontiers in Education}, pages = {206}, title = {Multiple Texts as a Limiting Factor in Online Learning: Quantifying (Dis-)similarities of Knowledge Networks}, url = {https://www.frontiersin.org/article/10.3389/feduc.2020.562670}, pdf = {https://www.frontiersin.org/articles/10.3389/feduc.2020.562670/pdf}, volume = {5}, year = {2020} }
-
C. Driller, M. Koch, G. Abrami, W. Hemati, A. Lücking, A. Mehler, A. Pachzelt, and G. Kasperek, “Fast and Easy Access to Central European Biodiversity Data with BIOfid,” Biodiversity Information Science and Standards, vol. 4, p. e59157, 2020.
[Abstract] [BibTeX]The storage of data in public repositories such as the Global Biodiversity Information Facility (GBIF) or the National Center for Biotechnology Information (NCBI) is nowadays stipulated in the policies of many publishers in order to facilitate data replication or proliferation. Species occurrence records contained in legacy printed literature are no exception to this. The extent of their digital and machine-readable availability, however, is still far from matching the existing data volume (Thessen and Parr 2014). But precisely these data are becoming more and more relevant to the investigation of ongoing loss of biodiversity. In order to extract species occurrence records at a larger scale from available publications, one has to apply specialised text mining tools. However, such tools are in short supply especially for scientific literature in the German language.The Specialised Information Service Biodiversity Research*1 BIOfid (Koch et al. 2017) aims at reducing this desideratum, inter alia, by preparing a searchable text corpus semantically enriched by a new kind of multi-label annotation. For this purpose, we feed manual annotations into automatic, machine-learning annotators. This mixture of automatic and manual methods is needed, because BIOfid approaches a new application area with respect to language (mainly German of the 19th century), text type (biological reports), and linguistic focus (technical and everyday language).We will present current results of the performance of BIOfid’s semantic search engine and the application of independent natural language processing (NLP) tools. Most of these are freely available online, such as TextImager (Hemati et al. 2016). We will show how TextImager is tied into the BIOfid pipeline and how it is made scalable (e.g. extendible by further modules) and usable on different systems (docker containers).Further, we will provide a short introduction to generating machine-learning training data using TextAnnotator (Abrami et al. 2019) for multi-label annotation. Annotation reproducibility can be assessed by the implementation of inter-annotator agreement methods (Abrami et al. 2020). Beyond taxon recognition and entity linking, we place particular emphasis on location and time information. For this purpose, our annotation tag-set combines general categories and biology-specific categories (including taxonomic names) with location and time ontologies. The application of the annotation categories is regimented by annotation guidelines (Lücking et al. 2020). Within the next years, our work deliverable will be a semantically accessible and data-extractable text corpus of around two million pages. In this way, BIOfid is creating a new valuable resource that expands our knowledge of biodiversity and its determinants.
@article{Driller:et:al:2020, author = {Christine Driller and Markus Koch and Giuseppe Abrami and Wahed Hemati and Andy Lücking and Alexander Mehler and Adrian Pachzelt and Gerwin Kasperek}, title = {Fast and Easy Access to Central European Biodiversity Data with BIOfid}, volume = {4}, number = {}, year = {2020}, doi = {10.3897/biss.4.59157}, publisher = {Pensoft Publishers}, abstract = {The storage of data in public repositories such as the Global Biodiversity Information Facility (GBIF) or the National Center for Biotechnology Information (NCBI) is nowadays stipulated in the policies of many publishers in order to facilitate data replication or proliferation. Species occurrence records contained in legacy printed literature are no exception to this. The extent of their digital and machine-readable availability, however, is still far from matching the existing data volume (Thessen and Parr 2014). But precisely these data are becoming more and more relevant to the investigation of ongoing loss of biodiversity. In order to extract species occurrence records at a larger scale from available publications, one has to apply specialised text mining tools. However, such tools are in short supply especially for scientific literature in the German language.The Specialised Information Service Biodiversity Research*1 BIOfid (Koch et al. 2017) aims at reducing this desideratum, inter alia, by preparing a searchable text corpus semantically enriched by a new kind of multi-label annotation. For this purpose, we feed manual annotations into automatic, machine-learning annotators. This mixture of automatic and manual methods is needed, because BIOfid approaches a new application area with respect to language (mainly German of the 19th century), text type (biological reports), and linguistic focus (technical and everyday language).We will present current results of the performance of BIOfid’s semantic search engine and the application of independent natural language processing (NLP) tools. Most of these are freely available online, such as TextImager (Hemati et al. 2016). We will show how TextImager is tied into the BIOfid pipeline and how it is made scalable (e.g. extendible by further modules) and usable on different systems (docker containers).Further, we will provide a short introduction to generating machine-learning training data using TextAnnotator (Abrami et al. 2019) for multi-label annotation. Annotation reproducibility can be assessed by the implementation of inter-annotator agreement methods (Abrami et al. 2020). Beyond taxon recognition and entity linking, we place particular emphasis on location and time information. For this purpose, our annotation tag-set combines general categories and biology-specific categories (including taxonomic names) with location and time ontologies. The application of the annotation categories is regimented by annotation guidelines (Lücking et al. 2020). Within the next years, our work deliverable will be a semantically accessible and data-extractable text corpus of around two million pages. In this way, BIOfid is creating a new valuable resource that expands our knowledge of biodiversity and its determinants.}, issn = {}, pages = {e59157}, URL = {https://doi.org/10.3897/biss.4.59157}, eprint = {https://doi.org/10.3897/biss.4.59157}, journal = {Biodiversity Information Science and Standards} }
-
M. Stoeckel, A. Henlein, W. Hemati, and A. Mehler, “Voting for POS tagging of Latin texts: Using the flair of FLAIR to better Ensemble Classifiers by Example of Latin,” in Proceedings of LT4HALA 2020 – 1st Workshop on Language Technologies for Historical and Ancient Languages, Marseille, France, 2020, pp. 130-135.
[Abstract] [BibTeX]Despite the great importance of the Latin language in the past, there are relatively few resources available today to develop modern NLP tools for this language. Therefore, the EvaLatin Shared Task for Lemmatization and Part-of-Speech (POS) tagging was published in the LT4HALA workshop. In our work, we dealt with the second EvaLatin task, that is, POS tagging. Since most of the available Latin word embeddings were trained on either few or inaccurate data, we trained several embeddings on better data in the first step. Based on these embeddings, we trained several state-of-the-art taggers and used them as input for an ensemble classifier called LSTMVoter. We were able to achieve the best results for both the cross-genre and the cross-time task (90.64% and 87.00%) without using additional annotated data (closed modality). In the meantime, we further improved the system and achieved even better results (96.91% on classical, 90.87% on cross-genre and 87.35% on cross-time).
@InProceedings{Stoeckel:et:al:2020, author = {Stoeckel, Manuel and Henlein, Alexander and Hemati, Wahed and Mehler, Alexander}, title = {{Voting for POS tagging of Latin texts: Using the flair of FLAIR to better Ensemble Classifiers by Example of Latin}}, booktitle = {Proceedings of LT4HALA 2020 - 1st Workshop on Language Technologies for Historical and Ancient Languages}, month = {May}, year = {2020}, address = {Marseille, France}, publisher = {European Language Resources Association (ELRA)}, pages = {130--135}, abstract = {Despite the great importance of the Latin language in the past, there are relatively few resources available today to develop modern NLP tools for this language. Therefore, the EvaLatin Shared Task for Lemmatization and Part-of-Speech (POS) tagging was published in the LT4HALA workshop. In our work, we dealt with the second EvaLatin task, that is, POS tagging. Since most of the available Latin word embeddings were trained on either few or inaccurate data, we trained several embeddings on better data in the first step. Based on these embeddings, we trained several state-of-the-art taggers and used them as input for an ensemble classifier called LSTMVoter. We were able to achieve the best results for both the cross-genre and the cross-time task (90.64\% and 87.00\%) without using additional annotated data (closed modality). In the meantime, we further improved the system and achieved even better results (96.91\% on classical, 90.87\% on cross-genre and 87.35\% on cross-time).}, url = {https://www.aclweb.org/anthology/2020.lt4hala-1.21}, pdf = {http://www.lrec-conf.org/proceedings/lrec2020/workshops/LT4HALA/pdf/2020.lt4hala-1.21.pdf} }
-
A. Mehler, B. Jussen, T. Geelhaar, A. Henlein, G. Abrami, D. Baumartz, T. Uslu, and W. Hemati, “The Frankfurt Latin Lexicon. From Morphological Expansion and Word Embeddings to SemioGraphs,” Studi e Saggi Linguistici, vol. 58, iss. 1, pp. 121-155, 2020.
[Abstract] [BibTeX]In this article we present the Frankfurt Latin Lexicon (FLL), a lexical resource for Medieval Latin that is used both for the lemmatization of Latin texts and for the post-editing of lemmatizations. We describe recent advances in the development of lemmatizers and test them against the Capitularies corpus (comprising Frankish royal edicts, mid-6th to mid-9th century), a corpus created as a reference for processing Medieval Latin. We also consider the post-correction of lemmatizations using a limited crowdsourcing process aimed at continuous review and updating of the FLL. Starting from the texts resulting from this lemmatization process, we describe the extension of the FLL by means of word embeddings, whose interactive traversing by means of SemioGraphs completes the digital enhanced hermeneutic circle. In this way, the article argues for a more comprehensive understanding of lemmatization, encompassing classical machine learning as well as intellectual post-corrections and, in particular, human computation in the form of interpretation processes based on graph representations of the underlying lexical resources.
@article{Mehler:et:al:2020b, author={Mehler, Alexander and Jussen, Bernhard and Geelhaar, Tim and Henlein, Alexander and Abrami, Giuseppe and Baumartz, Daniel and Uslu, Tolga and Hemati, Wahed}, title={{The Frankfurt Latin Lexicon. From Morphological Expansion and Word Embeddings to SemioGraphs}}, journal={Studi e Saggi Linguistici}, doi={10.4454/ssl.v58i1.276}, year={2020}, volume={58}, number={1}, pages={121--155}, abstract={In this article we present the Frankfurt Latin Lexicon (FLL), a lexical resource for Medieval Latin that is used both for the lemmatization of Latin texts and for the post-editing of lemmatizations. We describe recent advances in the development of lemmatizers and test them against the Capitularies corpus (comprising Frankish royal edicts, mid-6th to mid-9th century), a corpus created as a reference for processing Medieval Latin. We also consider the post-correction of lemmatizations using a limited crowdsourcing process aimed at continuous review and updating of the FLL. Starting from the texts resulting from this lemmatization process, we describe the extension of the FLL by means of word embeddings, whose interactive traversing by means of SemioGraphs completes the digital enhanced hermeneutic circle. In this way, the article argues for a more comprehensive understanding of lemmatization, encompassing classical machine learning as well as intellectual post-corrections and, in particular, human computation in the form of interpretation processes based on graph representations of the underlying lexical resources.}, url={https://www.studiesaggilinguistici.it/index.php/ssl/article/view/276}, pdf={https://www.studiesaggilinguistici.it/index.php/ssl/article/download/276/219} }
-
J. Hildebrand, W. Hemati, and A. Mehler, “Recognizing Sentence-level Logical Document Structures with the Help of Context-free Grammars,” in Proceedings of The 12th Language Resources and Evaluation Conference, Marseille, France, 2020, pp. 5282-5290.
[Abstract] [BibTeX]Current sentence boundary detectors split documents into sequentially ordered sentences by detecting their beginnings and ends. Sentences, however, are more deeply structured even on this side of constituent and dependency structure: they can consist of a main sentence and several subordinate clauses as well as further segments (e.g. inserts in parentheses); they can even recursively embed whole sentences and then contain multiple sentence beginnings and ends. In this paper, we introduce a tool that segments sentences into tree structures to detect this type of recursive structure. To this end, we retrain different constituency parsers with the help of modified training data to transform them into sentence segmenters. With these segmenters, documents are mapped to sequences of sentence-related “logical document structures”. The resulting segmenters aim to improve downstream tasks by providing additional structural information. In this context, we experiment with German dependency parsing. We show that for certain sentence categories, which can be determined automatically, improvements in German dependency parsing can be achieved using our segmenter for preprocessing. The assumption suggests that improvements in other languages and tasks can be achieved.
@InProceedings{Hildebrand:Hemati:Mehler:2020, Author = {Hildebrand, Jonathan and Hemati, Wahed and Mehler, Alexander}, Title = {Recognizing Sentence-level Logical Document Structures with the Help of Context-free Grammars}, booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference}, month = {May}, year = {2020}, address = {Marseille, France}, publisher = {European Language Resources Association}, pages = {5282--5290}, abstract = {Current sentence boundary detectors split documents into sequentially ordered sentences by detecting their beginnings and ends. Sentences, however, are more deeply structured even on this side of constituent and dependency structure: they can consist of a main sentence and several subordinate clauses as well as further segments (e.g. inserts in parentheses); they can even recursively embed whole sentences and then contain multiple sentence beginnings and ends. In this paper, we introduce a tool that segments sentences into tree structures to detect this type of recursive structure. To this end, we retrain different constituency parsers with the help of modified training data to transform them into sentence segmenters. With these segmenters, documents are mapped to sequences of sentence-related “logical document structures”. The resulting segmenters aim to improve downstream tasks by providing additional structural information. In this context, we experiment with German dependency parsing. We show that for certain sentence categories, which can be determined automatically, improvements in German dependency parsing can be achieved using our segmenter for preprocessing. The assumption suggests that improvements in other languages and tasks can be achieved.}, url = {https://www.aclweb.org/anthology/2020.lrec-1.650}, pdf = {http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.650.pdf} }
-
A. Mehler, R. Gleim, R. Gaitsch, T. Uslu, and W. Hemati, “From Topic Networks to Distributed Cognitive Maps: Zipfian Topic Universes in the Area of Volunteered Geographic Information,” Complexity, vol. 4, pp. 1-47, 2020.
[BibTeX]@Article{Mehler:Gleim:Gaitsch:Uslu:Hemati:2020, author = {Alexander Mehler and R{\"{u}}diger Gleim and Regina Gaitsch and Tolga Uslu and Wahed Hemati}, title = {From Topic Networks to Distributed Cognitive Maps: {Zipfian} Topic Universes in the Area of Volunteered Geographic Information}, journal = {Complexity}, volume = {4}, doi={10.1155/2020/4607025}, pages = {1-47}, issuetitle = {Cognitive Network Science: A New Frontier}, year = {2020}, }
2019 (6)
-
M. Stoeckel, W. Hemati, and A. Mehler, “When Specialization Helps: Using Pooled Contextualized Embeddings to Detect Chemical and Biomedical Entities in Spanish,” in Proceedings of The 5th Workshop on BioNLP Open Shared Tasks, Hong Kong, China, 2019, pp. 11-15.
[Abstract] [BibTeX]The recognition of pharmacological substances, compounds and proteins is an essential preliminary work for the recognition of relations between chemicals and other biomedically relevant units. In this paper, we describe an approach to Task 1 of the PharmaCoNER Challenge, which involves the recognition of mentions of chemicals and drugs in Spanish medical texts. We train a state-of-the-art BiLSTM-CRF sequence tagger with stacked Pooled Contextualized Embeddings, word and sub-word embeddings using the open-source framework FLAIR. We present a new corpus composed of articles and papers from Spanish health science journals, termed the Spanish Health Corpus, and use it to train domain-specific embeddings which we incorporate in our model training. We achieve a result of 89.76% F1-score using pre-trained embeddings and are able to improve these results to 90.52% F1-score using specialized embeddings.
@inproceedings{Stoeckel:Hemati:Mehler:2019, title = "When Specialization Helps: Using Pooled Contextualized Embeddings to Detect Chemical and Biomedical Entities in {S}panish", author = "Stoeckel, Manuel and Hemati, Wahed and Mehler, Alexander", booktitle = "Proceedings of The 5th Workshop on BioNLP Open Shared Tasks", month = nov, year = "2019", address = "Hong Kong, China", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/D19-5702", doi = "10.18653/v1/D19-5702", pages = "11--15", abstract = "The recognition of pharmacological substances, compounds and proteins is an essential preliminary work for the recognition of relations between chemicals and other biomedically relevant units. In this paper, we describe an approach to Task 1 of the PharmaCoNER Challenge, which involves the recognition of mentions of chemicals and drugs in Spanish medical texts. We train a state-of-the-art BiLSTM-CRF sequence tagger with stacked Pooled Contextualized Embeddings, word and sub-word embeddings using the open-source framework FLAIR. We present a new corpus composed of articles and papers from Spanish health science journals, termed the Spanish Health Corpus, and use it to train domain-specific embeddings which we incorporate in our model training. We achieve a result of 89.76{\%} F1-score using pre-trained embeddings and are able to improve these results to 90.52{\%} F1-score using specialized embeddings.", }
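The FLAIR-based setup described in the abstract above can be illustrated roughly as follows. This is a minimal sketch, not the authors' code; the corpus paths and embedding identifiers are placeholders and may differ across flair versions.

```python
# Minimal sketch: BiLSTM-CRF tagger over stacked pooled contextualized, word and
# sub-word embeddings with flair. Paths and embedding names are illustrative only.
from flair.datasets import ColumnCorpus
from flair.embeddings import (BytePairEmbeddings, PooledFlairEmbeddings,
                              StackedEmbeddings, WordEmbeddings)
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# CoNLL-style column corpus: token in column 0, BIO entity tag in column 1.
corpus = ColumnCorpus("data/pharmaconer", {0: "text", 1: "ner"},
                      train_file="train.txt", dev_file="dev.txt", test_file="test.txt")
tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")  # make_label_dictionary in newer flair

embeddings = StackedEmbeddings([
    WordEmbeddings("es"),                 # Spanish fastText word embeddings
    BytePairEmbeddings("es"),             # sub-word embeddings
    PooledFlairEmbeddings("es-forward"),  # pooled contextualized embeddings
    PooledFlairEmbeddings("es-backward"), # (identifiers depend on the flair version)
])

tagger = SequenceTagger(hidden_size=256, embeddings=embeddings,
                        tag_dictionary=tag_dictionary, tag_type="ner", use_crf=True)

ModelTrainer(tagger, corpus).train("models/pharmaconer",
                                   learning_rate=0.1, mini_batch_size=32, max_epochs=100)
```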
-
A. Hunziker, H. Mammadov, W. Hemati, and A. Mehler, “Corpus2Wiki: A MediaWiki-based Tool for Automatically Generating Wikiditions in Digital Humanities,” in INF-DH-2019, Bonn, 2019.
[BibTeX]@inproceedings{Hunziker:et:al:2019, author = {Hunziker, Alex and Mammadov, Hasanagha and Hemati, Wahed and Mehler, Alexander}, title = {{Corpus2Wiki}: A MediaWiki-based Tool for Automatically Generating Wikiditions in Digital Humanities}, booktitle = {INF-DH-2019}, year = {2019}, editor = {Burghardt, Manuel AND Müller-Birn, Claudia}, publisher = {Gesellschaft für Informatik e.V.}, address = {Bonn} }
-
W. Hemati and A. Mehler, “CRFVoter: gene and protein related object recognition using a conglomerate of CRF-based tools,” Journal of Cheminformatics, vol. 11, iss. 1, p. 11, 2019.
[Abstract] [BibTeX]Gene and protein related objects are an important class of entities in biomedical research, whose identification and extraction from scientific articles is attracting increasing interest. In this work, we describe an approach to the BioCreative V.5 challenge regarding the recognition and classification of gene and protein related objects. For this purpose, we transform the task as posed by BioCreative V.5 into a sequence labeling problem. We present a series of sequence labeling systems that we used and adapted in our experiments for solving this task. Our experiments show how to optimize the hyperparameters of the classifiers involved. To this end, we utilize various algorithms for hyperparameter optimization. Finally, we present CRFVoter, a two-stage application of Conditional Random Field (CRF) that integrates the optimized sequence labelers from our study into one ensemble classifier.
@article{Hemati:Mehler:2019b, author="Hemati, Wahed and Mehler, Alexander", title="{{CRFVoter}: gene and protein related object recognition using a conglomerate of CRF-based tools}", journal={Journal of Cheminformatics}, year="2019", month="Mar", day="14", volume="11", number="1", pages="11", abstract="Gene and protein related objects are an important class of entities in biomedical research, whose identification and extraction from scientific articles is attracting increasing interest. In this work, we describe an approach to the BioCreative V.5 challenge regarding the recognition and classification of gene and protein related objects. For this purpose, we transform the task as posed by BioCreative V.5 into a sequence labeling problem. We present a series of sequence labeling systems that we used and adapted in our experiments for solving this task. Our experiments show how to optimize the hyperparameters of the classifiers involved. To this end, we utilize various algorithms for hyperparameter optimization. Finally, we present CRFVoter, a two-stage application of Conditional Random Field (CRF) that integrates the optimized sequence labelers from our study into one ensemble classifier.", issn="1758-2946", doi="10.1186/s13321-019-0343-x", url="https://doi.org/10.1186/s13321-019-0343-x" }
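The two-stage ensemble idea behind CRFVoter can be illustrated with a toy meta-CRF that takes the per-token predictions of several base taggers as its only features. The following is a hypothetical sketch using sklearn-crfsuite, not the authors' implementation; the tagger outputs and labels are invented for illustration.

```python
# Toy illustration of a second-stage CRF that learns how to combine base taggers.
import sklearn_crfsuite

def to_features(predictions_per_tagger):
    """predictions_per_tagger: list of label sequences, one per base tagger,
    all aligned to the same token sequence."""
    n_tokens = len(predictions_per_tagger[0])
    return [{f"tagger_{i}": preds[t] for i, preds in enumerate(predictions_per_tagger)}
            for t in range(n_tokens)]

# Hypothetical base-tagger outputs and gold labels for two short sentences.
base_outputs = [
    [["B-GENE", "O", "O"], ["B-GENE", "I-GENE", "O"]],  # taggers 0 and 1, sentence 1
    [["O", "B-GENE"], ["B-GENE", "I-GENE"]],            # taggers 0 and 1, sentence 2
]
gold = [["B-GENE", "I-GENE", "O"], ["B-GENE", "I-GENE"]]

X_train = [to_features(sentence) for sentence in base_outputs]
meta_crf = sklearn_crfsuite.CRF(algorithm="lbfgs", c1=0.1, c2=0.1, max_iterations=100)
meta_crf.fit(X_train, gold)
print(meta_crf.predict(X_train))
```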
-
W. Hemati and A. Mehler, “LSTMVoter: chemical named entity recognition using a conglomerate of sequence labeling tools,” Journal of Cheminformatics, vol. 11, iss. 1, p. 7, 2019.
[Abstract] [BibTeX]Chemical and biomedical named entity recognition (NER) is an essential preprocessing task in natural language processing. The identification and extraction of named entities from scientific articles is also attracting increasing interest in many scientific disciplines. Locating chemical named entities in the literature is an essential step in chemical text mining pipelines for identifying chemical mentions, their properties, and relations as discussed in the literature. In this work, we describe an approach to the BioCreative V.5 challenge regarding the recognition and classification of chemical named entities. For this purpose, we transform the task of NER into a sequence labeling problem. We present a series of sequence labeling systems that we used, adapted and optimized in our experiments for solving this task. To this end, we experiment with hyperparameter optimization. Finally, we present LSTMVoter, a two-stage application of recurrent neural networks that integrates the optimized sequence labelers from our study into a single ensemble classifier.
@article{Hemati:Mehler:2019a, abstract = "Chemical and biomedical named entity recognition (NER) is an essential preprocessing task in natural language processing. The identification and extraction of named entities from scientific articles is also attracting increasing interest in many scientific disciplines. Locating chemical named entities in the literature is an essential step in chemical text mining pipelines for identifying chemical mentions, their properties, and relations as discussed in the literature. In this work, we describe an approach to the BioCreative V.5 challenge regarding the recognition and classification of chemical named entities. For this purpose, we transform the task of NER into a sequence labeling problem. We present a series of sequence labeling systems that we used, adapted and optimized in our experiments for solving this task. To this end, we experiment with hyperparameter optimization. Finally, we present LSTMVoter, a two-stage application of recurrent neural networks that integrates the optimized sequence labelers from our study into a single ensemble classifier.", author = "Hemati, Wahed and Mehler, Alexander", day = "10", doi = "10.1186/s13321-018-0327-2", issn = "1758-2946", journal = "Journal of Cheminformatics", month = "Jan", number = "1", pages = "7", title = "{{LSTMVoter}: chemical named entity recognition using a conglomerate of sequence labeling tools}", url = "https://doi.org/10.1186/s13321-018-0327-2", volume = "11", year = "2019" }
-
W. Hemati, A. Mehler, T. Uslu, and G. Abrami, “Der TextImager als Front- und Backend für das verteilte NLP von Big Digital Humanities Data,” in Proceedings of the 6th Digital Humanities Conference in the German-speaking Countries, DHd 2019, 2019.
[Poster][BibTeX]@InProceedings{Hemati:Mehler:Uslu:Abrami:2019, Author = {Hemati, Wahed and Mehler, Alexander and Uslu, Tolga and Abrami, Giuseppe}, Title = {{Der TextImager als Front- und Backend für das verteilte NLP von Big Digital Humanities Data}}, BookTitle = {Proceedings of the 6th Digital Humanities Conference in the German-speaking Countries, DHd 2019}, Series = {DHd 2019}, pdf = {https://www.texttechnologylab.org/wp-content/uploads/2019/04/Der-TextImager-als-Fron-und-Backend.pdf}, poster = {https://www.texttechnologylab.org/wp-content/uploads/2019/04/DHD19_TextImager.pdf}, location = {Frankfurt, Germany}, year = 2019 }
-
R. Gleim, S. Eger, A. Mehler, T. Uslu, W. Hemati, A. Lücking, A. Henlein, S. Kahlsdorf, and A. Hoenen, “A practitioner’s view: a survey and comparison of lemmatization and morphological tagging in German and Latin,” Journal of Language Modeling, 2019.
[BibTeX]@article{Gleim:Eger:Mehler:2019, author = {Gleim, R\"{u}diger and Eger, Steffen and Mehler, Alexander and Uslu, Tolga and Hemati, Wahed and L\"{u}cking, Andy and Henlein, Alexander and Kahlsdorf, Sven and Hoenen, Armin}, title = {A practitioner's view: a survey and comparison of lemmatization and morphological tagging in German and Latin}, journal = {Journal of Language Modeling}, year = {2019}, pdf = {https://www.texttechnologylab.org/wp-content/uploads/2019/07/jlm-tagging.pdf}, doi = {10.15398/jlm.v7i1.205}, url = {http://jlm.ipipan.waw.pl/index.php/JLM/article/view/205} }
2018 (9)
-
E. Rutherford, W. Hemati, and A. Mehler, “Corpus2Wiki: A MediaWiki based Annotation & Visualisation Tool for the Digital Humanities,” in INF-DH-2018, Bonn, 2018.
[BibTeX]@inproceedings{Rutherford:et:al:2018, author = {Rutherford, Eleanor AND Hemati, Wahed AND Mehler, Alexander}, title = {{Corpus2Wiki}: A MediaWiki based Annotation \& Visualisation Tool for the Digital Humanities}, booktitle = {INF-DH-2018}, year = {2018}, editor = {Burghardt, Manuel AND Müller-Birn, Claudia}, publisher = {Gesellschaft für Informatik e.V.}, address = {Bonn} }
-
C. Driller, M. Koch, M. Schmidt, C. Weiland, T. Hörnschemeyer, T. Hickler, G. Abrami, S. Ahmed, R. Gleim, W. Hemati, T. Uslu, A. Mehler, A. Pachzelt, J. Rexhepi, T. Risse, J. Schuster, G. Kasperek, and A. Hausinger, “Workflow and Current Achievements of BIOfid, an Information Service Mobilizing Biodiversity Data from Literature Sources,” Biodiversity Information Science and Standards, vol. 2, p. e25876, 2018.
[Abstract] [BibTeX]BIOfid is a specialized information service currently being developed to mobilize biodiversity data dormant in printed historical and modern literature and to offer a platform for open access journals on the science of biodiversity. Our team of librarians, computer scientists and biologists produce high-quality text digitizations, develop new text-mining tools and generate detailed ontologies enabling semantic text analysis and semantic search by means of user-specific queries. In a pilot project we focus on German publications on the distribution and ecology of vascular plants, birds, moths and butterflies extending back to the Linnaeus period about 250 years ago. The three organism groups have been selected according to current demands of the relevant research community in Germany. The text corpus defined for this purpose comprises over 400 volumes with more than 100,000 pages to be digitized and will be complemented by journals from other digitization projects, copyright-free and project-related literature. With TextImager (Natural Language Processing & Text Visualization) and TextAnnotator (Discourse Semantic Annotation) we have already extended and launched tools that focus on the text-analytical section of our project. Furthermore, taxonomic and anatomical ontologies elaborated by us for the taxa prioritized by the project’s target group - German institutions and scientists active in biodiversity research - are constantly improved and expanded to maximize scientific data output. Our poster describes the general workflow of our project ranging from literature acquisition via software development, to data availability on the BIOfid web portal (http://biofid.de/), and the implementation into existing platforms which serve to promote global accessibility of biodiversity data.
@article{Driller:et:al:2018, author = {Christine Driller and Markus Koch and Marco Schmidt and Claus Weiland and Thomas Hörnschemeyer and Thomas Hickler and Giuseppe Abrami and Sajawel Ahmed and Rüdiger Gleim and Wahed Hemati and Tolga Uslu and Alexander Mehler and Adrian Pachzelt and Jashar Rexhepi and Thomas Risse and Janina Schuster and Gerwin Kasperek and Angela Hausinger}, title = {Workflow and Current Achievements of BIOfid, an Information Service Mobilizing Biodiversity Data from Literature Sources}, volume = {2}, number = {}, year = {2018}, doi = {10.3897/biss.2.25876}, publisher = {Pensoft Publishers}, abstract = {BIOfid is a specialized information service currently being developed to mobilize biodiversity data dormant in printed historical and modern literature and to offer a platform for open access journals on the science of biodiversity. Our team of librarians, computer scientists and biologists produce high-quality text digitizations, develop new text-mining tools and generate detailed ontologies enabling semantic text analysis and semantic search by means of user-specific queries. In a pilot project we focus on German publications on the distribution and ecology of vascular plants, birds, moths and butterflies extending back to the Linnaeus period about 250 years ago. The three organism groups have been selected according to current demands of the relevant research community in Germany. The text corpus defined for this purpose comprises over 400 volumes with more than 100,000 pages to be digitized and will be complemented by journals from other digitization projects, copyright-free and project-related literature. With TextImager (Natural Language Processing & Text Visualization) and TextAnnotator (Discourse Semantic Annotation) we have already extended and launched tools that focus on the text-analytical section of our project. Furthermore, taxonomic and anatomical ontologies elaborated by us for the taxa prioritized by the project’s target group - German institutions and scientists active in biodiversity research - are constantly improved and expanded to maximize scientific data output. Our poster describes the general workflow of our project ranging from literature acquisition via software development, to data availability on the BIOfid web portal (http://biofid.de/), and the implementation into existing platforms which serve to promote global accessibility of biodiversity data.}, issn = {}, pages = {e25876}, URL = {https://doi.org/10.3897/biss.2.25876}, eprint = {https://doi.org/10.3897/biss.2.25876}, journal = {Biodiversity Information Science and Standards} }
-
W. Hemati, A. Mehler, T. Uslu, D. Baumartz, and G. Abrami, “Evaluating and Integrating Databases in the Area of NLP,” in International Quantitative Linguistics Conference (QUALICO 2018), 2018.
[Poster][BibTeX]@inproceedings{Hemati:Mehler:Uslu:Baumartz:Abrami:2018, author={Wahed Hemati and Alexander Mehler and Tolga Uslu and Daniel Baumartz and Giuseppe Abrami}, title={Evaluating and Integrating Databases in the Area of {NLP}}, booktitle={International Quantitative Linguistics Conference (QUALICO 2018)}, year={2018}, pdf={https://www.texttechnologylab.org/wp-content/uploads/2018/04/Hemat-Mehler-Uslu-Baumartz-Abrami-Qualico-2018.pdf}, poster={https://www.texttechnologylab.org/wp-content/uploads/2018/10/qualico2018_databases_poster_hemati_mehler_uslu_baumartz_abrami.pdf}, location={Wroclaw, Poland} }
-
A. Mehler, W. Hemati, R. Gleim, and D. Baumartz, “VienNA: Auf dem Weg zu einer Infrastruktur für die verteilte interaktive evolutionäre Verarbeitung natürlicher Sprache,” in Forschungsinfrastrukturen und digitale Informationssysteme in der germanistischen Sprachwissenschaft , H. Lobin, R. Schneider, and A. Witt, Eds., Berlin: De Gruyter, 2018, vol. 6.
[BibTeX]@InCollection{Mehler:Hemati:Gleim:Baumartz:2018, Author = {Alexander Mehler and Wahed Hemati and Rüdiger Gleim and Daniel Baumartz}, Title = {{VienNA: }{Auf dem Weg zu einer Infrastruktur für die verteilte interaktive evolutionäre Verarbeitung natürlicher Sprache}}, BookTitle = {Forschungsinfrastrukturen und digitale Informationssysteme in der germanistischen Sprachwissenschaft }, Publisher = {De Gruyter}, Editor = {Henning Lobin and Roman Schneider and Andreas Witt}, Volume = {6}, Address = {Berlin}, year = 2018 }
-
A. Mehler, W. Hemati, T. Uslu, and A. Lücking, “A Multidimensional Model of Syntactic Dependency Trees for Authorship Attribution,” in Quantitative analysis of dependency structures, J. Jiang and H. Liu, Eds., Berlin/New York: De Gruyter, 2018.
[Abstract] [BibTeX]Abstract: In this chapter we introduce a multidimensional model of syntactic dependency trees. Our ultimate goal is to generate fingerprints of such trees to predict the author of the underlying sentences. The chapter makes a first attempt to create such fingerprints for sentence categorization via the detour of text categorization. We show that at text level, aggregated dependency structures actually provide information about authorship. At the same time, we show that this does not hold for topic detection. We evaluate our model using a quarter of a million sentences collected in two corpora: the first is sampled from literary texts, the second from Wikipedia articles. As a second finding of our approach, we show that quantitative models of dependency structure do not yet allow for detecting syntactic alignment in written communication. We conclude that this is mainly due to effects of lexical alignment on syntactic alignment.
@InCollection{Mehler:Hemati:Uslu:Luecking:2018, Author = {Alexander Mehler and Wahed Hemati and Tolga Uslu and Andy Lücking}, Title = {A Multidimensional Model of Syntactic Dependency Trees for Authorship Attribution}, BookTitle = {Quantitative analysis of dependency structures}, Publisher = {De Gruyter}, Editor = {Jingyang Jiang and Haitao Liu}, Address = {Berlin/New York}, abstract = {Abstract: In this chapter we introduce a multidimensional model of syntactic dependency trees. Our ultimate goal is to generate fingerprints of such trees to predict the author of the underlying sentences. The chapter makes a first attempt to create such fingerprints for sentence categorization via the detour of text categorization. We show that at text level, aggregated dependency structures actually provide information about authorship. At the same time, we show that this does not hold for topic detection. We evaluate our model using a quarter of a million sentences collected in two corpora: the first is sampled from literary texts, the second from Wikipedia articles. As a second finding of our approach, we show that quantitative models of dependency structure do not yet allow for detecting syntactic alignment in written communication. We conclude that this is mainly due to effects of lexical alignment on syntactic alignment.}, keywords = {Dependency structure, Authorship attribution, Text categorization, Syntactic Alignment}, year = 2018 }
-
T. Uslu, L. Miebach, S. Wolfsgruber, M. Wagner, K. Fließbach, R. Gleim, W. Hemati, A. Henlein, and A. Mehler, “Automatic Classification in Memory Clinic Patients and in Depressive Patients,” in Proceedings of Resources and ProcessIng of linguistic, para-linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric impairments (RaPID-2), 2018.
[BibTeX]@InProceedings{Uslu:et:al:2018:a, Author = {Tolga Uslu and Lisa Miebach and Steffen Wolfsgruber and Michael Wagner and Klaus Fließbach and Rüdiger Gleim and Wahed Hemati and Alexander Henlein and Alexander Mehler}, Title = {{Automatic Classification in Memory Clinic Patients and in Depressive Patients}}, BookTitle = {Proceedings of Resources and ProcessIng of linguistic, para-linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric impairments (RaPID-2)}, Series = {RaPID}, location = {Miyazaki, Japan}, year = 2018 }
-
T. Uslu, A. Mehler, D. Baumartz, A. Henlein, and W. Hemati, “fastSense: An Efficient Word Sense Disambiguation Classifier,” in Proceedings of the 11th edition of the Language Resources and Evaluation Conference, May 7 – 12, Miyazaki, Japan, 2018.
[BibTeX]@InProceedings{Uslu:et:al:2018, Author = {Tolga Uslu and Alexander Mehler and Daniel Baumartz and Alexander Henlein and Wahed Hemati }, Title = {fastSense: An Efficient Word Sense Disambiguation Classifier}, BookTitle = {Proceedings of the 11th edition of the Language Resources and Evaluation Conference, May 7 - 12}, Series = {LREC 2018}, Address = {Miyazaki, Japan}, pdf = {https://www.texttechnologylab.org/wp-content/uploads/2018/03/fastSense.pdf}, year = 2018 }
-
A. Mehler, O. Zlatkin-Troitschanskaia, W. Hemati, D. Molerov, A. Lücking, and S. Schmidt, “Integrating Computational Linguistic Analysis of Multilingual Learning Data and Educational Measurement Approaches to Explore Learning in Higher Education,” in Positive Learning in the Age of Information: A Blessing or a Curse?, O. Zlatkin-Troitschanskaia, G. Wittum, and A. Dengel, Eds., Wiesbaden: Springer Fachmedien Wiesbaden, 2018, pp. 145-193.
[Abstract] [BibTeX]This chapter develops a computational linguistic model for analyzing and comparing multilingual data as well as its application to a large body of standardized assessment data from higher education. The approach employs both an automatic and a manual annotation of the data on several linguistic layers (including parts of speech, text structure and content). Quantitative features of the textual data are explored that are related to both the students' (domain-specific knowledge) test results and their level of academic experience. The respective analysis involves statistics of distance correlation, text categorization with respect to text types (questions and response options) as well as languages (English and German), and network analysis to assess dependencies between features. The correlation between correct test results of students and linguistic features of the verbal presentations of tests indicate to what extent language influences higher education test performance. It has also been found that this influence relates to specialized language. Thus, this integrative modeling approach contributes a test basis for a large-scale analysis of learning data and points to a number of subsequent, more detailed research questions.
@inbook{Mehler:et:al:2018, abstract = "This chapter develops a computational linguistic model for analyzing and comparing multilingual data as well as its application to a large body of standardized assessment data from higher education. The approach employs both an automatic and a manual annotation of the data on several linguistic layers (including parts of speech, text structure and content). Quantitative features of the textual data are explored that are related to both the students' (domain-specific knowledge) test results and their level of academic experience. The respective analysis involves statistics of distance correlation, text categorization with respect to text types (questions and response options) as well as languages (English and German), and network analysis to assess dependencies between features. The correlation between correct test results of students and linguistic features of the verbal presentations of tests indicate to what extent language influences higher education test performance. It has also been found that this influence relates to specialized language. Thus, this integrative modeling approach contributes a test basis for a large-scale analysis of learning data and points to a number of subsequent, more detailed research questions.", address = "Wiesbaden", author = "Mehler, Alexander and Zlatkin-Troitschanskaia, Olga and Hemati, Wahed and Molerov, Dimitri and L{\"u}cking, Andy and Schmidt, Susanne", booktitle = "Positive Learning in the Age of Information: A Blessing or a Curse?", doi = "10.1007/978-3-658-19567-0_10", editor = "Zlatkin-Troitschanskaia, Olga and Wittum, Gabriel and Dengel, Andreas", isbn = "978-3-658-19567-0", pages = "145--193", publisher = "Springer Fachmedien Wiesbaden", title = "Integrating Computational Linguistic Analysis of Multilingual Learning Data and Educational Measurement Approaches to Explore Learning in Higher Education", url = "https://doi.org/10.1007/978-3-658-19567-0_10", year = "2018" }
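As a side note to the distance-correlation statistics mentioned in this abstract, the following self-contained sketch (assuming numpy and scipy; not taken from the chapter) computes the empirical distance correlation of two samples.

```python
# Empirical distance correlation (Székely et al.); x and y hold one observation per row.
import numpy as np
from scipy.spatial.distance import pdist, squareform

def distance_correlation(x, y):
    x = np.asarray(x, dtype=float).reshape(len(x), -1)
    y = np.asarray(y, dtype=float).reshape(len(y), -1)
    a, b = squareform(pdist(x)), squareform(pdist(y))  # pairwise Euclidean distances
    A = a - a.mean(axis=0) - a.mean(axis=1, keepdims=True) + a.mean()  # double centering
    B = b - b.mean(axis=0) - b.mean(axis=1, keepdims=True) + b.mean()
    dcov2, dvarx, dvary = (A * B).mean(), (A * A).mean(), (B * B).mean()
    return 0.0 if dvarx * dvary == 0 else float(np.sqrt(dcov2 / np.sqrt(dvarx * dvary)))

# Example: a nonlinear dependence that Pearson correlation would largely miss.
rng = np.random.default_rng(0)
u = rng.normal(size=500)
print(distance_correlation(u, u ** 2))  # clearly greater than 0
```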
-
G. Abrami, S. Ahmed, R. Gleim, W. Hemati, A. Mehler, and T. Uslu, Natural Language Processing and Text Mining for BIOfid, 2018.
[BibTeX]@misc{Abrami:et:al:2018b, author = {Abrami, Giuseppe and Ahmed, Sajawel and Gleim, R{\"u}diger and Hemati, Wahed and Mehler, Alexander and Uslu, Tolga}, title = {{Natural Language Processing and Text Mining for BIOfid}}, howpublished = {Presentation at the 1st Meeting of the Scientific Advisory Board of the BIOfid Project}, address = {Goethe-University, Frankfurt am Main, Germany}, year = {2018}, month = {March}, day = {08} }
2017 (4)
-
W. Hemati, A. Mehler, and T. Uslu, “CRFVoter: Chemical Entity Mention, Gene and Protein Related Object recognition using a conglomerate of CRF based tools,” in BioCreative V.5. Proceedings, 2017.
[BibTeX]@InProceedings{Hemati:Mehler:Uslu:2017, Author = {Wahed Hemati and Alexander Mehler and Tolga Uslu}, Title = {{CRFVoter}: Chemical Entity Mention, Gene and Protein Related Object recognition using a conglomerate of CRF based tools}, BookTitle = {BioCreative V.5. Proceedings}, pdf = {https://www.texttechnologylab.org/wp-content/uploads/2018/03/CRFVoter.pdf}, year = 2017 }
-
W. Hemati, T. Uslu, and A. Mehler, “TextImager as an interface to BeCalm,” in BioCreative V.5. Proceedings, 2017.
[BibTeX]@InProceedings{Hemati:Uslu:Mehler:2017, Author = {Wahed Hemati and Tolga Uslu and Alexander Mehler}, Title = {{TextImager} as an interface to {BeCalm}}, BookTitle = {BioCreative V.5. Proceedings}, pdf = {https://www.texttechnologylab.org/wp-content/uploads/2018/03/TextImager_BeCalm.pdf}, year = 2017 }
-
A. Mehler, R. Gleim, W. Hemati, and T. Uslu, “Skalenfreie online soziale Lexika am Beispiel von Wiktionary,” in Proceedings of 53rd Annual Conference of the Institut für Deutsche Sprache (IDS), March 14-16, Mannheim, Germany, Berlin, 2017. In German; the title translates to: “Scale-free Online Social Lexica by the Example of Wiktionary”.
[Abstract] [BibTeX]In English: The paper deals with characteristics of the structural, thematic and participatory dynamics of collaboratively generated lexical networks. This is done by example of Wiktionary. Starting from a network-theoretical model in terms of so-called multi-layer networks, we describe Wiktionary as a scale-free lexicon. Systems of this sort are characterized by the fact that their content-related dynamics is determined by the underlying dynamics of collaborating authors. This happens in a way that social structure imprints on content structure. According to this conception, the unequal distribution of the activities of authors results in a correspondingly unequal distribution of the information units documented within the lexicon. The paper focuses on foundations for describing such systems starting from a parameter space which requires to deal with Wiktionary as an issue in big data analysis. In German: Der Beitrag thematisiert Eigenschaften der strukturellen, thematischen und partizipativen Dynamik kollaborativ erzeugter lexikalischer Netzwerke am Beispiel von Wiktionary. Ausgehend von einem netzwerktheoretischen Modell in Form so genannter Mehrebenennetzwerke wird Wiktionary als ein skalenfreies Lexikon beschrieben. Systeme dieser Art zeichnen sich dadurch aus, dass ihre inhaltliche Dynamik durch die zugrundeliegende Kollaborationsdynamik bestimmt wird, und zwar so, dass sich die soziale Struktur der entsprechenden inhaltlichen Struktur aufprägt. Dieser Auffassung gemäß führt die Ungleichverteilung der Aktivitäten von Lexikonproduzenten zu einer analogen Ungleichverteilung der im Lexikon dokumentierten Informationseinheiten. Der Beitrag thematisiert Grundlagen zur Beschreibung solcher Systeme ausgehend von einem Parameterraum, welcher die netzwerkanalytische Betrachtung von Wiktionary als Big-Data-Problem darstellt.
@InProceedings{Mehler:Gleim:Hemati:Uslu:2017, Author = {Alexander Mehler and Rüdiger Gleim and Wahed Hemati and Tolga Uslu}, Title = {{Skalenfreie online soziale Lexika am Beispiel von Wiktionary}}, BookTitle = {Proceedings of 53rd Annual Conference of the Institut für Deutsche Sprache (IDS), March 14-16, Mannheim, Germany}, Editor = {Stefan Engelberg and Henning Lobin and Kathrin Steyer and Sascha Wolfer}, Address = {Berlin}, Publisher = {De Gruyter}, Note = {In German. Title translates into: Scale-free online-social Lexika by Example of Wiktionary}, abstract = {In English: The paper deals with characteristics of the structural, thematic and participatory dynamics of collaboratively generated lexical networks. This is done by example of Wiktionary. Starting from a network-theoretical model in terms of so-called multi-layer networks, we describe Wiktionary as a scale-free lexicon. Systems of this sort are characterized by the fact that their content-related dynamics is determined by the underlying dynamics of collaborating authors. This happens in a way that social structure imprints on content structure. According to this conception, the unequal distribution of the activities of authors results in a correspondingly unequal distribution of the information units documented within the lexicon. The paper focuses on foundations for describing such systems starting from a parameter space which requires to deal with Wiktionary as an issue in big data analysis. In German: Der Beitrag thematisiert Eigenschaften der strukturellen, thematischen und partizipativen Dynamik kollaborativ erzeugter lexikalischer Netzwerke am Beispiel von Wiktionary. Ausgehend von einem netzwerktheoretischen Modell in Form so genannter Mehrebenennetzwerke wird Wiktionary als ein skalenfreies Lexikon beschrieben. Systeme dieser Art zeichnen sich dadurch aus, dass ihre inhaltliche Dynamik durch die zugrundeliegende Kollaborationsdynamik bestimmt wird, und zwar so, dass sich die soziale Struktur der entsprechenden inhaltlichen Struktur aufprägt. Dieser Auffassung gemäß führt die Ungleichverteilung der Aktivitäten von Lexikonproduzenten zu einer analogen Ungleichverteilung der im Lexikon dokumentierten Informationseinheiten. Der Beitrag thematisiert Grundlagen zur Beschreibung solcher Systeme ausgehend von einem Parameterraum, welcher die netzwerkanalytische Betrachtung von Wiktionary als Big-Data-Problem darstellt.}, year = 2017 }
-
T. Uslu, W. Hemati, A. Mehler, and D. Baumartz, “TextImager as a Generic Interface to R,” in Software Demonstrations of the 15th Conference of the European Chapter of the Association for Computational Linguistics (EACL 2017), 2017.
[BibTeX]@InProceedings{Uslu:Hemati:Mehler:Baumartz:2017, Author = {Tolga Uslu and Wahed Hemati and Alexander Mehler and Daniel Baumartz}, Title = {{TextImager} as a Generic Interface to {R}}, BookTitle = {Software Demonstrations of the 15th Conference of the European Chapter of the Association for Computational Linguistics (EACL 2017)}, location = {Valencia, Spain}, pdf = {https://www.texttechnologylab.org/wp-content/uploads/2018/03/TextImager.pdf}, year = 2017 }
2016 (3)
-
W. Hemati, T. Uslu, and A. Mehler, “TextImager: a Distributed UIMA-based System for NLP,” in Proceedings of the COLING 2016 System Demonstrations, 2016.
[BibTeX]@InProceedings{Hemati:Uslu:Mehler:2016, Author = {Wahed Hemati and Tolga Uslu and Alexander Mehler}, Title = {TextImager: a Distributed UIMA-based System for NLP}, BookTitle = {Proceedings of the COLING 2016 System Demonstrations}, Organization = {Federated Conference on Computer Science and Information Systems}, location = {Osaka, Japan}, pdf = {https://www.texttechnologylab.org/wp-content/uploads/2018/03/TextImager2016.pdf}, year = 2016 }
-
A. Mehler, T. Uslu, and W. Hemati, “Text2voronoi: An Image-driven Approach to Differential Diagnosis,” in Proceedings of the 5th Workshop on Vision and Language (VL’16) hosted by the 54th Annual Meeting of the Association for Computational Linguistics (ACL), Berlin, 2016.
[BibTeX]@InProceedings{Mehler:Uslu:Hemati:2016, Author = {Alexander Mehler and Tolga Uslu and Wahed Hemati}, Title = {Text2voronoi: An Image-driven Approach to Differential Diagnosis}, BookTitle = {Proceedings of the 5th Workshop on Vision and Language (VL'16) hosted by the 54th Annual Meeting of the Association for Computational Linguistics (ACL), Berlin}, pdf = {https://aclweb.org/anthology/W/W16/W16-3212.pdf}, year = 2016 }
-
A. Mehler, R. Gleim, T. vor der Brück, W. Hemati, T. Uslu, and S. Eger, “Wikidition: Automatic Lexiconization and Linkification of Text Corpora,” Information Technology, vol. 58, pp. 70-79, 2016.
[Abstract] [BibTeX]We introduce a new text technology, called Wikidition, which automatically generates large scale editions of corpora of natural language texts. Wikidition combines a wide range of text mining tools for automatically linking lexical, sentential and textual units. This includes the extraction of corpus-specific lexica down to the level of syntactic words and their grammatical categories. To this end, we introduce a novel measure of text reuse and exemplify Wikidition by means of the capitularies, that is, a corpus of Medieval Latin texts.
@Article{Mehler:et:al:2016, Author = {Alexander Mehler and Rüdiger Gleim and Tim vor der Brück and Wahed Hemati and Tolga Uslu and Steffen Eger}, Title = {Wikidition: Automatic Lexiconization and Linkification of Text Corpora}, Journal = {Information Technology}, Volume = {58}, Pages = {70-79}, abstract = {We introduce a new text technology, called Wikidition, which automatically generates large scale editions of corpora of natural language texts. Wikidition combines a wide range of text mining tools for automatically linking lexical, sentential and textual units. This includes the extraction of corpus-specific lexica down to the level of syntactic words and their grammatical categories. To this end, we introduce a novel measure of text reuse and exemplify Wikidition by means of the capitularies, that is, a corpus of Medieval Latin texts.}, doi = {10.1515/itit-2015-0035}, year = 2016 }