# ------------------------------------------------------------------------------------------ # MTKeywords # (a plugin for Movable Type v2.63) # # Release 1.01 # April 8, 2006 # # by Richard D. LeCour # http://www.richardsramblings.com/?p=370 # ------------------------------------------------------------------------------------------ # This software is provided as-is. # You may use it for commercial or personal use. # If you distribute it, please keep this notice intact. # # Copyright (c) 2004,2006 Richard D. LeCour # ------------------------------------------------------------------------------------------ # # USAGE: <$MTKeywords delimiter="" caseSensitive="true|false" includeBigrams="true|false"$> # # ------------------------------------------------------------------------------------------ package MT::Plugins::Keywords; use MT::Template::Context; use MT::Sanitize; MT::Template::Context->add_tag(Keywords => \&Keywords); sub Keywords { my $ctx = shift; my $args = shift; my $blog = $ctx->stash('blog'); my $entry = $ctx->stash('entry'); my ($result, $item, $previousword, $bigram, $keywordcount) = ""; my (%unigramcount, %bigramcount) = (); my $defaultDelimiter = ", "; my $defaultCaseSensitive = "true"; my $defaultIncludeBigrams = "true"; my $argDelimiter = $args->{delimiter}; my $argCaseSensitive = $args->{caseSensitive}; my $argIncludeBigrams = $args->{includeBigrams}; if (defined($argDelimiter)) { $defaultDelimiter = $argDelimiter; } if (defined($argCaseSensitive)) { $defaultCaseSensitive = $argCaseSensitive; } if (defined($argIncludeBigrams)) { $defaultIncludeBigrams = $argIncludeBigrams; } if (defined($entry)) { ### GET ENTRY TEXT ### my $body = $entry->text." ".$entry->title; my $comments = $entry->comments; for my $comment (@$comments) { $body .= " ".$comment->text; } ### STRIP OUT HTML AND SPACES ### $body = MT::Sanitize->sanitize($body); $body =~ s/&[a-z]+;*/ /g; $body =~ s/\.(aiff|arj|arts|asp|au|avi|bin|biz|css|cgi|com|doc|edu|exe|firm|gif|gz|gzip|htm|html|info|jpeg|jpg|js|jsp|mp3|mpeg|mpg|mov)(\b|\s|$)//g; $body =~ s/\.(net|nom|org|pdf|php|pl|qt|ra|ram|rec|shop|sit|tar|tgz|tiff|txt|wav|web|zip)(\b|\s|$)//g; $body =~ s/(^|\b|\s)(one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve)(\b|\s|$)/ /gi; $body =~ s/(^|\b|\s)(about|across|after|against|all|along|also|amid|among|an|and|are|around|as|at)(\b|\s|$)/ /gi; $body =~ s/(^|\b|\s)(before|behind|below|beneath|beyond|but|by|can|can't|come|did|didn't|don't|down|during)(\b|\s|$)/ /gi; $body =~ s/(^|\b|\s)(each|except|few|for|forward|from|had|has|have|her|hers|him|his|how|http|in|inside|into|is|it|its|just)(\b|\s|$)/ /gi; $body =~ s/(^|\b|\s)(like|many|more|near|no|non|not|now|of|off|on|once|only|onto|or|other|out|over|past|said|she|since|so|some|such)(\b|\s|$)/ /gi; $body =~ s/(^|\b|\s)(than|that|the|their|then|them|there|these|they|this|through|throughout|to|toward|too)(\b|\s|$)/ /gi; $body =~ s/(^|\b|\s)(use|very|was|way|were|what|when|where|which|who|will|with|within|without|won't|would|www|you|your)(\b|\s|$)/ /gi; $body = join (' ', split (/[\041-\100\132-\140\173-\177]/, $body)); $body =~ s/\s+/ /g; my @words = split (/\s/, $body); $keywordcount = 0; undef %unigramcount; undef %bigramcount; undef %trigramcount; if ($defaultCaseSensitive ne 'false') { foreach $item (@words) { $unigramcount{$item}++; if (lc $item ne $item) { $unigramcount{(lc $item)}++; } } } else { foreach $item (@words) { $unigramcount{lc $item}++; } } $previousword = ""; foreach $item (@words) { if (length($item) > 2 and length($previousword) > 2) { $bigram = lc ($previousword." ".$item); $bigramcount{$bigram}++; } $previousword = $item; } if ($defaultIncludeBigrams ne 'false') { foreach $item (sort { $bigramcount{$b} <=> $bigramcount{$a} } keys %bigramcount) { if ($bigramcount{$item} >= 2 and $keywordcount < 5) { if ($result ne '') { $result .= $defaultDelimiter; } $result .= $item; $keywordcount++; } } } foreach $item (sort { $unigramcount{$b} <=> $unigramcount{$a} } keys %unigramcount) { if (length($item) > 2 and $unigramcount{$item} > 1 and $keywordcount < 25) { if ($result ne '') { $result .= $defaultDelimiter; } $result .= $item; $keywordcount++; } } } return $result; } # --------------------------------------------------------------------------- # RELEASE NOTES # # 0.99 10/19/2004 original version # 0.99a 11/08/2004 included more basic file and web extensions to exclude # 0.99b 12/09/2004 included a few more words to exclude # fixed "Useless use of a variable" error seen with Perl v5.8.4 # 1.01 04/08/2006 added delimiter, caseSensitive, and includeBigram parameters # ---------------------------------------------------------------------------