#!/usr/bin/perl -w
#
# Build a binned keyword-frequency report from a pipe-delimited text file.
#
# Usage:  perl <this-script> <input-file>
#
# Each input line is split on '|'.  The first field is used as the bin key
# (called the "year" below, as in the original comments) and the second
# field is a whitespace-separated keyword list.  Any further fields are
# ignored.
#
# Output, written to STDOUT:
#   1. one header line per distinct year, in sorted order:
#          # <year> <number of records with that year> <bin index>
#   2. one line per distinct (keyword, year) pair, sorted by the string
#      "<keyword>#<year>":
#          <bin index> <keyword> <occurrences of the keyword in that year>

use strict;
use warnings;

# Run only when executed as a script, so the subs below can also be loaded
# (require/do) and exercised without touching @ARGV or the filesystem.
main(@ARGV) unless caller;

# main($file_name)
#   Reads every line of $file_name and prints the report to STDOUT.
#   Dies if no file name was given or the file cannot be opened.
sub main {
    my ($file_name) = @_;

    die "Usage: $0 <input-file>\n"
        unless defined $file_name && length $file_name;

    # Three-argument open: the name is always treated as a plain path,
    # never as a mode or pipe specification.
    open(my $in, '<', $file_name)
        or die "File not found: $file_name ($!)\n";
    my @lines = <$in>;
    close($in);

    print build_report(@lines);
    return;
}

# build_report(@lines)
#   Pure function: takes the raw input lines (trailing newline optional)
#   and returns the list of formatted, newline-terminated report lines.
#   The caller's strings are not modified.
sub build_report {
    my @lines = @_;

    # Parse each line once into [year, keyword-list] records.
    my @records;
    for my $line (@lines) {
        chomp $line;

        # A blank line is not a record; counting it would create a
        # spurious empty-year bin and shift every bin index.
        next unless $line =~ /\S/;

        ## *** The line below needs to be changed according to the dataset **
        my ($year, $keywords) = split /\|/, $line;
        $year = '' unless defined $year;

        push @records, [ $year, $keywords ];
    }

    # Number of records per year.
    my %records_in;
    $records_in{ $_->[0] }++ for @records;

    # Header section: assign consecutive bin indices to the sorted years.
    my @report;
    my %bin_of;
    my $next_bin = 0;
    for my $year (sort keys %records_in) {
        push @report, "# $year $records_in{$year} $next_bin\n";
        $bin_of{$year} = $next_bin++;
    }

    # Count every (keyword, year) pair.  The hash key "<keyword>#<year>"
    # exists only to give the output its ordering; the parts are stored
    # alongside the count so a '#' inside a keyword can never be mistaken
    # for the separator.  (Two pairs could share a key only if a year
    # itself contained '#'.)
    my %pair;
    for my $record (@records) {
        my ($year, $keywords) = @{$record};
        next unless defined $keywords;    # line had no '|' separator

        # split ' ' discards leading/trailing whitespace and never yields
        # tokens that contain whitespace, so only punctuation is cleaned.
        for my $token (split ' ', $keywords) {
            (my $word = $token) =~ tr/*+"://d;    # strip * + " and :
            next if $word eq '';

            my $key = "$word#$year";
            $pair{$key} ||= { word => $word, year => $year, count => 0 };
            $pair{$key}{count}++;
        }
    }

    # Body section: "<bin> <keyword> <count>" for every pair.
    for my $key (sort keys %pair) {
        my $entry = $pair{$key};
        push @report,
            join(' ', $bin_of{ $entry->{year} }, $entry->{word},
                 $entry->{count}) . "\n";
    }

    return @report;
}