-----------------------------------------------------------------------
Title: Exploration of three cluster stopping rules for the task of
       Word Sense Discrimination

By: Anagha Kulkarni, Graduate student, University of Minnesota, Duluth,
    Department of Computer Science, and summer intern, Biomedical
    Informatics Research, Mayo Clinic, Rochester

Supervisor: Dr. Guergana K. Savova, Senior Analyst, Division of
            Biomedical Informatics, Mayo Clinic, Rochester

Abstract:
Cluster analysis is performed to discover the groups that the data
naturally fall into. The number of groups that the data separate out
into is not known a priori; rather, it needs to be determined based on
the data. Various techniques, referred to as "stopping rules" in
general, have been proposed in the past to estimate the optimal number
of clusters for a given data set. We perform a comparative study of 3
such "stopping rules", namely Hartigan, 1975; Calinski and Harabasz,
1974; and Tibshirani et al., 2001, when applied to a natural language
processing task of unsupervised word sense discrimination (WSD).
Cluster stopping rules have been applied to a variety of domains - DNA
microarray data (Tibshirani et al., 2001), bacteriological,
anthropometric and plant-breeding data (Calinski and Harabasz, 1974),
Olympic times data (Hartigan, 1975). In our study we experiment with
the biomedical domain. We use SenseClusters, a package developed by the
NLP group led by Dr. Ted Pedersen at the University of Minnesota,
Duluth, and a clustering package - CLUTO, developed at the University
of Minnesota, Twin Cities, at Dr. Karypis' lab.
-------------------------------------------------------------------------

Note on the bibliography entries below: everything outside an entry is
ignored by BibTeX, so this report text and the list numbering are
harmless commentary. Names are stored as "Last, First" and joined with
" and " so that BibTeX splits them correctly.

Publications read and discussed in detail:
------------------------------------------

1. @article{TibshiraniWH01,
  author  = {Tibshirani, R. and Walther, G. and Hastie, T.},
  title   = {Estimating the number of clusters in a data set via the gap statistic},
  journal = {Journal of the Royal Statistical Society},
  volume  = {63},
  number  = {2},
  pages   = {411--424},
  year    = {2001},
}

2. @article{CalinskiH74,
  author  = {Calinski, T. and Harabasz, J.},
  title   = {A dendrite method for cluster analysis},
  journal = {Communications in Statistics},
  volume  = {3},
  number  = {1},
  pages   = {1--27},
  year    = {1974},
}

3. @book{Hartigan75,
  author    = {Hartigan, J.},
  title     = {Clustering Algorithms},
  publisher = {John Wiley and Sons},
  address   = {New York, NY, US},
  year      = {1975},
}

4. @book{Cohen95,
  author    = {Cohen, P.},
  title     = {Empirical Methods for Artificial Intelligence},
  publisher = {MIT Press},
  address   = {Cambridge, Mass., US},
  year      = {1995},
}

5. @book{Gordon99,
  author    = {Gordon, A.},
  title     = {Classification},
  publisher = {Chapman \& Hall/CRC},
  address   = {London},
  year      = {1999},
}

Publications read and discussed:
--------------------------------

1. @inproceedings{PedersenKB96,
  author    = {Pedersen, T. and Kayaalp, M. and Bruce, R.},
  title     = {Significant Lexical Relationships},
  booktitle = {Proceedings of the Thirteenth National Conference on Artificial Intelligence},
  address   = {Portland, OR},
  pages     = {455--460},
  month     = aug,
  year      = {1996},
}

2. @article{Patefield81,
  author  = {Patefield, W.},
  title   = {An Efficient Method of Generating Random {R} X {C} Tables with Given Row and Column Totals},
  journal = {Applied Statistics},
  volume  = {30},
  pages   = {91--97},
  year    = {1981},
}

3. @book{HastieTF00,
  author    = {Hastie, T. and Tibshirani, R. and Friedman, J.},
  title     = {The Elements of Statistical Learning: Data Mining, Inference, and Prediction},
  publisher = {Springer Verlag},
  year      = {2000},
}

Distribution of time spent:
---------------------------
1. Meetings (Guergana, NLP group, Terry, Tanya (Statistician), Other presentations attended): 20%
2. Discussions (Guergana, Ann Oberg (Statistician), Dan A., Dave Z., Dana, Patrick, Marcy, Jim B., Ted): 25%
3.
Programming, testing, experimenting, releasing: 40%
4. Reading and Researching: 15%

Software developed:
-------------------
1. Implementation of the Gap Statistic (for small contexts) as a Perl module.
   http://search.cpan.org/dist/Statistics-Gap/
2. Implementation of the stopping rule proposed by Calinski and Harabasz as a Perl module.
   http://search.cpan.org/dist/Statistics-CalinskiHarabasz/
3. Implementation of the stopping rule proposed by Hartigan as a Perl module.
   http://search.cpan.org/dist/Statistics-Hartigan/

Reports written:
----------------
1. Report on the Gap Statistic for the Natural Language Processing domain.

Other resources:
----------------
1. Scripts for experimental setup.
2. Scripts to extract and compile results.

Other work products/deliverables:
---------------------------------
None.