% Journal article: Vietnamese treebank (VTB) construction and entropy-based
% error detection, Language Resources and Evaluation 49(3), 2015.
% The doi field holds the bare DOI (no "doi:" or resolver prefix); the url
% field points at the institutional eprints record.
@article{SisLab1592,
  author    = {Nguyen, Phuong Thai and Le, Anh Cuong and Ho, Tu Bao and Nguyen, Van Hiep},
  title     = {{Vietnamese} treebank construction and entropy-based error detection},
  journal   = {Language Resources and Evaluation},
  volume    = {49},
  number    = {3},
  pages     = {487--519},
  year      = {2015},
  publisher = {Springer},
  doi       = {10.1007/s10579-015-9308-5},
  url       = {https://eprints.uet.vnu.edu.vn/eprints/id/eprint/1592/},
  abstract  = {Treebanks, especially the Penn treebank for natural language
    processing (NLP) in English, play an essential role in both research into
    and the application of NLP. However, many languages still lack treebanks
    and building a treebank can be very complicated and difficult. This work
    has a twofold objective. Firstly, to share our results in constructing a
    large Vietnamese treebank (VTB) with three levels of annotation including
    word segmentation, part-of-speech tagging, and syntactic analysis. Major
    steps in the treebank construction process are described with particular
    regard to specific Vietnamese properties such as lack of word delimiter
    and isolation. Those properties make sentences highly syntactically
    ambiguous, and therefore it is difficult to ensure a high level of
    agreement among annotators. Various studies of Vietnamese syntax were
    employed not only to define annotations but also to systematically deal
    with ambiguities. Annotators were supported by automatic labelling tools,
    which are based on statistical machine learning methods, for sentence
    pre-processing and a tree editor for supporting manual annotation. As a
    result, an annotation agreement of around 90 \% was achieved. Our second
    objective is to present our method for automatically finding errors and
    inconsistencies in treebank corpora and its application to the
    construction of the VTB. This method employs the Shannon entropy measure
    in a manner that the more reduced entropy the more corrected errors in a
    treebank. The method ranks error candidates by using a scoring function
    based on conditional entropy. Our experiments showed that this method
    detected high-error-density subsets of original error candidate sets, and
    that the corpus entropy was significantly reduced after error correction.
    The size of these subsets was only about one third of the whole set,
    while these subsets contained 80--90 \% of the total errors. This method
    can also be applied to languages similar to Vietnamese.},
}