#!/usr/local/bin/perl -w

######################################################
# ted pedersen
# january 23, 2001
#
# compute the token / type ratio for a text. display
# the frequency of the word types, in sorted numeric
# order. Note that input can come from files named
# on the command line (in which case the file names
# are stored in @ARGV) or from user input (no command
# line arguments) that is terminated by control-d.
#
# A token is a maximal run of ASCII letters and digits.
# Tokens are folded to lower case before counting, so
# "The" and "the" are the same type.
######################################################

use strict;
use warnings;

# Return the list of lower-cased word tokens found in one line of text.
#
# The words are matched directly rather than produced by splitting on the
# separators: split() yields an empty leading field whenever the line
# starts with whitespace or punctuation, and that empty string would be
# counted as a token (and as a type) of its own.
sub tokenize {
    my ($line) = @_;
    my @words = lc($line) =~ /[a-zA-Z0-9]+/g;
    return @words;
}

# Build the report for a finished count.
#
#   $count_ref - hash ref mapping each word type to its frequency
#   $tokens    - total number of tokens counted
#
# Returns the report as a list of newline-terminated lines: three summary
# lines followed by one line per type (word, relative frequency, count),
# most frequent first.
sub summary_lines {
    my ($count_ref, $tokens) = @_;

    my $types = keys %{$count_ref};

    # Empty input has no types; report a ratio of 0 instead of dying with
    # "Illegal division by zero".
    my $ratio = $types ? $tokens / $types : 0;

    my @lines = (
        sprintf("%d = number of tokens\n",        $tokens),
        sprintf("%d = number of types\n",         $types),
        sprintf("%6.4f = token to type ratio\n", $ratio),
    );

    # Numeric order, highest count first. Ties are broken alphabetically so
    # the output does not depend on hash ordering. (For a purely alphabetic
    # listing, use "sort keys %{$count_ref}" instead.)
    my @by_frequency =
        sort { $count_ref->{$b} <=> $count_ref->{$a} or $a cmp $b }
        keys %{$count_ref};

    for my $word (@by_frequency) {
        # $tokens cannot be 0 here: every key was counted at least once.
        push @lines,
            sprintf("%15s %6.4f %8d\n",
                    $word, $count_ref->{$word} / $tokens, $count_ref->{$word});
    }

    return @lines;
}

# Read the text from the files named in @ARGV (or from standard input when
# there are none), count the tokens, and print the report.
sub main {
    my %count;
    my $tokens = 0;

    while (my $line = <>) {
        for my $word (tokenize($line)) {
            $count{$word}++;
            $tokens++;
        }
    }

    print summary_lines(\%count, $tokens);
    return;
}

# Run only when executed as a script, so the subs above can be loaded and
# tested without reading any input.
main() unless caller;

1;