Date: Wed, 20 Feb 2002 18:08:19 -0500 (EST)
From: Max Pyziur
To: Munazza Bukhari
Cc: htdig-general@lists.sourceforge.net
Subject: Re: [htdig] can htdig track the search query string

[The following text is in the "X-UNKNOWN" character set]
[Your display is set for the "US-ASCII" character set]
[Some characters may be displayed incorrectly]

On Wed, 20 Feb 2002, Munazza Bukhari wrote:

>
> Does htdig let you track the search query strings? I
> am looking to improve
> the content of my website so I want to figure out what
> kind of things people
> are searching for?
>
> I can write a miner for my webserver's access log file
> but I was just wondering
> if there was something out there already.

I hacked something like this for my own website. Perhaps you might find it useful. I'll append it to this reply.

Since search terms on my website are trilingual (English, Ukrainian and Russian, the latter two using cp1251 encoding) I set up some arrays (%HEXARRAY) to filter/convert search terms which show up in the url as something like %20%21%23 ... and the like. You can amend or remove reference to the arrays (salt and pepper to your taste).

This might give me some incentive to make this code a little cleaner and portable.

One thing I noticed about htdig is that the first search is a POST request; consequently, the initial search term(s) isn't(aren't) in the logfile. However, subsequent searches are recorded using a GET request leaving the terms visible in the log entry.

Max Pyziur                                BRAMA - Gateway Ukraine
pyz@brama.com                             http://www.brama.com/

> Thanks,
> Munazza
>

############ begin script #########################
#!/usr/bin/perl
#
# Report the search terms submitted to htsearch, as recorded in a web
# server access log.
#
# Usage:  script ACCESSLOG      (log lines are read from STDIN when no
#                                file name is given)
#
# Only "GET /cgi-bin/htsearch" requests are examined.  The value that
# follows "words=" is decoded and lower-cased through %HEXARRAY, counted,
# and printed twice on STDOUT: ordered by request count, then ordered
# alphabetically.

$ACCESSLOG   = $ARGV[0] ;
$SEARCHTERMS = "/tmp/". $$ . "st.txt" ;	# scratch file used by prepterms()
$LOWERTHRESH = 40;			# not referenced anywhere in this script
$TODAY       = `date`;			# keeps the trailing newline from date(1)

# Conversion table: each key is replaced by its value in every search term.
#   - "%XX" escapes, listed once with upper-case and once with lower-case
#     hex digits (when a key occurs twice, the later entry wins);
#   - literal upper-case letters, mapped to their lower-case form.
# NOTE(review): many of the 0x80-0xFF entries look as if they were damaged
# when the message was archived (the author describes them as cp1251);
# they are reproduced as found -- verify against a pristine copy.
%HEXARRAY = (
	"\%20", " ", "\%21", "\!", "\%22", "\*", "\%23", "\#", "\%24", "\$", "\%25", "\%", "\%26", "\&", "\%27", "\'",
	"\%28", "\(", "\%29", "\)", "\%2A", "\*", "\%2B", "\+", "\%2C", "\,", "\%2D", "\-", "\%2E", "\.", "\%2F", "\/",
	"\%30", "0", "\%31", "1", "\%32", "2", "\%33", "3", "\%34", "4", "\%35", "5", "\%36", "6", "\%37", "7",
	"\%38", "8", "\%39", "9", "\%3A", "\:", "\%3B", "\;", "\%3C", "\<", "\%3D", "\=", "\%3E", "\>", "\%3F", "\?",
	"\%40", "\@", "\%41", "a", "\%42", "b", "\%43", "c", "\%44", "d", "\%45", "e", "\%46", "f", "\%47", "g",
	"\%48", "h", "\%49", "i", "\%4A", "j", "\%4B", "k", "\%4C", "l", "\%4D", "m", "\%4E", "n", "\%4F", "o",
	"\%50", "p", "\%51", "q", "\%52", "r", "\%53", "s", "\%54", "t", "\%55", "u", "\%56", "v", "\%57", "w",
	"\%58", "x", "\%59", "y", "\%5A", "z", "\%5B", "\[", "\%5C", "\\", "\%5D", "\]", "\%5E", "\^", "\%5F", "\_",
	"\%60", "\`", "\%61", "a", "\%62", "b", "\%63", "c", "\%64", "d", "\%65", "e", "\%66", "f", "\%67", "g",
	"\%68", "h", "\%69", "i", "\%6A", "j", "\%6B", "k", "\%6C", "l", "\%6D", "m", "\%6E", "n", "\%6F", "o",
	"\%70", "p", "\%71", "q", "\%72", "r", "\%73", "s", "\%74", "t", "\%75", "u", "\%76", "v", "\%77", "w",
	"\%78", "x", "\%79", "y", "\%7A", "z", "\%7B", "{", "\%7C", "|", "\%7D", "}", "\%7E", "~", "\%7F", " ",
	"\%80", " ", "\%81", "^Ã", "\%82", "\^Â", "\%83", "^Ã", "\%84", "\^Ä", "\%85", "\^Å", "\%86", "\^Æ", "\%87", "\^Ç",
	"\%88", " ", "\%89", "\^É", "\%8A", "^Ú", "\%8B", "\^Ë", "\%8C", "^Ü", "\%8D", "^Ý", "\%8E", " ", "\%8F", "^ß",
	"\%90", "^Ð", "\%91", "\^Ñ", "\%92", "\^Ò", "\%93", "\^Ó", "\%94", "\^Ô", "\%95", "\^Õ", "\%96", "\^Ö", "\%97", "\^Ö",
	"\%98", " ", "\%99", "\^Ù", "\%9A", "^Ú", "\%9B", "\^Û", "\%9C", "^Ü", "\%9D", "^Ý", "\%9E", " ", "\%9F", "^ß",
	"\%A0", "", "\%A1", "¢", "\%A2", "¢", "\%A3", " ", "\%A4", "¤", "\%A5", "´", "\%A6", "\¦", "\%A7", "§",
	"\%A8", "¸", "\%A9", "©", "\%AA", "º", "\%AB", "\«", "\%AC", "\¬", "\%AD", "\­", "\%AE", "®", "\%AF", "¿",
	"\%B0", "\°", "\%B1", "±", "\%B2", "³", "\%B3", "³", "\%B4", "´", "\%B5", "µ", "\%B6", "¶", "\%B7", "\·",
	"\%B8", "¸", "\%B9", "¹", "\%BA", "º", "\%BB", "\»", "\%BC", "¼", "\%BD", "¾", "\%BE", "¾", "\%BF", "¿",
	"\%C0", "à", "\%C1", "á", "\%C2", "â", "\%C3", "ã", "\%C4", "ä", "\%C5", "å", "\%C6", "æ", "\%C7", "ç",
	"\%C8", "è", "\%C9", "é", "\%CA", "ê", "\%CB", "ë", "\%CC", "ì", "\%CD", "í", "\%CE", "î", "\%CF", "ï",
	"\%D0", "ð", "\%D1", "ñ", "\%D2", "ò", "\%D3", "ó", "\%D4", "ô", "\%D5", "õ", "\%D6", "ö", "\%D7", "÷",
	"\%D8", "ø", "\%D9", "ù", "\%DA", "ú", "\%DB", "û", "\%DC", "ü", "\%DD", "ý", "\%DE", "þ", "\%DF", "^?",
	"\%E0", "à", "\%E1", "á", "\%E2", "â", "\%E3", "ã", "\%E4", "ä", "\%E5", "å", "\%E6", "æ", "\%E7", "ç",
	"\%E8", "è", "\%E9", "é", "\%EA", "ê", "\%EB", "ë", "\%EC", "ì", "\%ED", "í", "\%EE", "î", "\%EF", "ï",
	"\%F0", "ð", "\%F1", "ñ", "\%F2", "ò", "\%F3", "ó", "\%F4", "ô", "\%F5", "õ", "\%F6", "ö", "\%F7", "÷",
	"\%F8", "ø", "\%F9", "ù", "\%FA", "ú", "\%FB", "û", "\%FC", "ü", "\%FD", "ý", "\%FE", "þ", "\%FF", "^?",

	"\%20", " ", "\%21", "\!", "\%22", "\*", "\%23", "\#", "\%24", "\$", "\%25", "\%", "\%26", "\&", "\%27", "\'",
	"\%28", "\(", "\%29", "\)", "\%2a", "\*", "\%2b", "\+", "\%2c", "\,", "\%2d", "\-", "\%2e", "\.", "\%2f", "\/",
	"\%30", "0", "\%31", "1", "\%32", "2", "\%33", "3", "\%34", "4", "\%35", "5", "\%36", "6", "\%37", "7",
	"\%38", "8", "\%39", "9", "\%3a", "\:", "\%3b", "\;", "\%3c", "\<", "\%3d", "\=", "\%3e", "\>", "\%3f", "\?",
	"\%40", "\@", "\%41", "a", "\%42", "b", "\%43", "c", "\%44", "d", "\%45", "e", "\%46", "f", "\%47", "g",
	"\%48", "h", "\%49", "i", "\%4a", "j", "\%4b", "k", "\%4c", "l", "\%4d", "m", "\%4e", "n", "\%4f", "o",
	"\%50", "p", "\%51", "q", "\%52", "r", "\%53", "s", "\%54", "t", "\%55", "u", "\%56", "v", "\%57", "w",
	"\%58", "x", "\%59", "y", "\%5a", "z", "\%5b", "\[", "\%5c", "\\", "\%5d", "\]", "\%5e", "\^", "\%5f", "\_",
	"\%60", "\`", "\%61", "a", "\%62", "b", "\%63", "c", "\%64", "d", "\%65", "e", "\%66", "f", "\%67", "g",
	"\%68", "h", "\%69", "i", "\%6a", "j", "\%6b", "k", "\%6c", "l", "\%6d", "m", "\%6e", "n", "\%6f", "o",
	"\%70", "p", "\%71", "q", "\%72", "r", "\%73", "s", "\%74", "t", "\%75", "u", "\%76", "v", "\%77", "w",
	"\%78", "x", "\%79", "y", "\%7a", "z", "\%7b", "{", "\%7c", "|", "\%7d", "}", "\%7e", "~", "\%7f", " ",
	"\%80", " ", "\%81", "^Ã", "\%82", "\^Â", "\%83", "^Ã", "\%84", "\^Ä", "\%85", "\^Å", "\%86", "\^Æ", "\%87", "\^Ç",
	"\%88", " ", "\%89", "\^É", "\%8a", "^Ú", "\%8b", "\^Ë", "\%8c", "^Ü", "\%8d", "^Ý", "\%8e", " ", "\%8f", "^?",
	# %95 was written "\o", which Perl 5.14 and later reject at compile
	# time ("Missing braces on \o{}"); older perls read it as a plain "o".
	"\%90", "^Ð", "\%91", "\'", "\%92", "\'", "\%93", "\"", "\%94", "\"", "\%95", "o", "\%96", "\-", "\%97", "\-",
	"\%98", " ", "\%99", "\^Ù", "\%9a", "^Ú", "\%9b", "\^Û", "\%9c", "^Ü", "\%9d", "^Ý", "\%9e", " ", "\%9f", "^?",
	"\%a0", "", "\%a1", "¢", "\%a2", "¢", "\%a3", " ", "\%a4", "¤", "\%a5", "´", "\%a6", "\¦", "\%a7", "§",
	"\%a8", "¸", "\%a9", "©", "\%aa", "º", "\%ab", "\«", "\%ac", "\ ", "\%ad", "\­", "\%ae", "®", "\%af", "¿",
	"\%b0", "\°", "\%b1", "±", "\%b2", "³", "\%b3", "³", "\%b4", "´", "\%b5", "µ", "\%b6", "", "\%b7", "\·",
	"\%b8", "¸", "\%b9", "¹", "\%ba", "º", "\%bb", "\»", "\%bc", "¼", "\%bd", "¾", "\%be", "¾", "\%bf", "¿",
	"\%c0", "à", "\%c1", "á", "\%c2", "â", "\%c3", "ã", "\%c4", "ä", "\%c5", "å", "\%c6", "æ", "\%c7", "ç",
	"\%c8", "è", "\%c9", "é", "\%ca", "ê", "\%cb", "ë", "\%cc", "ì", "\%cd", "í", "\%ce", "î", "\%cf", "ï",
	"\%d0", "ð", "\%d1", "ñ", "\%d2", "ò", "\%d3", "ó", "\%d4", "ô", "\%d5", "õ", "\%d6", "ö", "\%d7", "÷",
	"\%d8", "ø", "\%d9", "ù", "\%da", "ú", "\%db", "û", "\%dc", "ü", "\%dd", "ý", "\%de", "þ", "\%df", "^?",
	"\%e0", "à", "\%e1", "á", "\%e2", "â", "\%e3", "ã", "\%e4", "ä", "\%e5", "å", "\%e6", "æ", "\%e7", "ç",
	"\%e8", "è", "\%e9", "é", "\%ea", "ê", "\%eb", "ë", "\%ec", "ì", "\%ed", "í", "\%ee", "î", "\%ef", "ï",
	"\%f0", "ð", "\%f1", "ñ", "\%f2", "ò", "\%f3", "ó", "\%f4", "ô", "\%f5", "õ", "\%f6", "ö", "\%f7", "÷",
	"\%f8", "ø", "\%f9", "ù", "\%fa", "ú", "\%fb", "û", "\%fc", "ü", "\%fd", "ý", "\%fe", "þ", "\%ff", "^?",

	"À", "à", "Á", "á", "Â", "â", "Ã", "ã", "Ä", "ä", "Å", "å", "Æ", "æ", "Ç", "ç",
	"È", "è", "²", "³", "¯", "¿", "É", "é", "Ê", "ê", "Ë", "ë", "Ì", "ì", "Í", "í",
	"Î", "î", "Ï", "ï", "Ð", "ð", "Ñ", "ñ", "Ò", "ò", "Ó", "ó", "Ô", "ô", "Õ", "õ",
	"Ö", "ö", "×", "÷", "Ø", "ø", "Ù", "ù", "Ú", "ú", "Û", "û", "Ü", "ü", "ª", "ý",
	"Þ", "þ", "ß", "^?",

	"A", "a", "B", "b", "C", "c", "D", "d", "E", "e", "F", "f", "G", "g", "H", "h",
	"I", "i", "J", "j", "K", "k", "L", "l", "M", "m", "N", "n", "O", "o", "P", "p",
	"Q", "q", "R", "r", "S", "s", "T", "t", "U", "u", "V", "v", "W", "w", "X", "x",
	"Y", "y", "Z", "z"
) ;

# The substitutions are applied one key at a time, so their order matters.
# keys() returns a different order on every run; sort instead so results are
# reproducible.  "%XX" keys sort ahead of the literal letters, and "%25"
# (which yields a bare "%") is moved to the very end so that the "%" it
# produces cannot be combined with following characters and decoded twice.
@HEXKEYS   = sort { ($a eq '%25') <=> ($b eq '%25') or $a cmp $b } keys(%HEXARRAY) ;
@HEXVALUES = @HEXARRAY{@HEXKEYS} ;

getterms() ;
prepterms() ;
printdata() ;
deltmpfiles() ;		# remove the scratch file written by prepterms()

# Read the access log and count every search term in %SEARCHTERMREQUESTS.
#
# Reads:  $ACCESSLOG (file name; STDIN is used when it is undefined or
#         empty), @HEXKEYS / @HEXVALUES.
# Writes: %SEARCHTERMREQUESTS{term} = number of requests for that term.
# Dies when the log file cannot be opened.
sub getterms() {
	my $log ;
	if (defined($ACCESSLOG) && $ACCESSLOG ne "") {
		# Open the file directly instead of piping it through
		# "egrep ... $ACCESSLOG |": the file name never reaches a shell.
		open($log, '<', $ACCESSLOG)
			or die "Cannot open access log '$ACCESSLOG': $!\n" ;
	} else {
		$log = \*STDIN ;
	}

	while (my $line = <$log>) {
		next unless $line =~ m{GET /cgi-bin/htsearch} ;
		chomp($line) ;

		# Fields: host rfc931 authuser [timestamp] "request" status bytes
		# -- only the quoted request is needed.
		my ($request) =
			$line =~ /^\S+ \S+ \S+ \[[^\]]*\] \"([^"]*)\" \S+ \S+/ ;
		next unless defined($request) ;

		# Keep only what follows "words=" and drop the trailing
		# ";page..." parameter and the " HTTP/x.y" protocol suffix.
		$request =~ s/GET.+words\=//g ;
		$request =~ s/\;page.+HTTP.+//g ;
		$request =~ s/ HTTP.+//g ;
		$request =~ s/\+/ /g ;

		# 0 .. $#HEXKEYS visits every key; the former "$i < $#HEXKEYS"
		# bound silently skipped the last one.
		for my $i (0 .. $#HEXKEYS) {
			$request =~ s/\Q$HEXKEYS[$i]\E/$HEXVALUES[$i]/g ;
		}

		$request =~ s/\*/ /g ;
		$request =~ s/^ {1,3}// ;	# strip at most three leading blanks
		$request =~ s/^\t //g ;
		# Requests without "words=" still carry the (now lower-cased)
		# request prefix; remove it.
		$request =~ s/get \/cgi-bin\/htsearch//g ;
		$request =~ s/^\+//g ;

		$SEARCHTERMREQUESTS{$request}++ if $request ne "" ;
	}

	close($log) ;
}

# Build the two orderings printed by printdata().
#
# Writes: @SEARCHTERMS (terms in alphabetical order), @ST ("count term"
#         lines in that order) and @RST (the same lines in descending
#         string order, i.e. most requested first while counts fit the
#         five-column field).
# Side effect: the lines are written to, and read back from, the file
#         named by $SEARCHTERMS.
# NOTE(review): that name is predictable ("/tmp/<pid>st.txt"); consider
#         File::Temp if the script runs on a shared machine.
# Dies when the scratch file cannot be written or read.
sub prepterms() {
	@SEARCHTERMS = sort(keys(%SEARCHTERMREQUESTS));

	open(my $out, '>', $SEARCHTERMS)
		or die "Cannot write $SEARCHTERMS: $!\n" ;
	foreach my $term (@SEARCHTERMS) {
		printf $out "%5.0f %s\n", $SEARCHTERMREQUESTS{$term}, $term ;
	}
	close($out)
		or die "Cannot finish writing $SEARCHTERMS: $!\n" ;

	open(my $in, '<', $SEARCHTERMS)
		or die "Cannot read $SEARCHTERMS: $!\n" ;
	@ST  = <$in> ;
	@RST = reverse(sort(@ST));
	close($in);
}

# Print the report on STDOUT: a heading, the run date, the terms ordered by
# popularity (@RST) and the terms in alphabetical order (@SEARCHTERMS).
#
# NOTE(review): the literal strings below look like HTML whose tags were
# removed when the message was archived; the visible text and line breaks
# are kept as found -- restore the markup from a pristine copy if one exists.
sub printdata() {
	print "\n";
	print "BRAMA's Search Engine Search Terms \n";
	print "\n";
	print "\n";
	print "\n\nBRAMA's Search Engine Search Terms\n\n\n";
	print "Report Run $TODAY\n\n\n";
	print "Most Popular Search Terms\n";
	print "\n\n";

	# Every entry is printed; the former "for ($i=1; $i<$#RST; $i++)"
	# left out both the most popular term and the last one.
	foreach my $line (@RST) {
		print "\t$line";
	}
	print "\n\n\n";
	print "\n";
	print "Alphabetized Search Terms\n";

	foreach my $term (@SEARCHTERMS) {
		printf  "\t%5.0f  %s\n", $SEARCHTERMREQUESTS{$term}, $term ;
	}

	print "\n";

}

# Remove the scratch file named by $SEARCHTERMS.
# Does nothing when the name is unset or the file does not exist; a failed
# removal is reported with a warning rather than being silently ignored.
sub deltmpfiles() {
	return unless defined($SEARCHTERMS) && -e $SEARCHTERMS ;
	unlink($SEARCHTERMS)
		or warn "Cannot remove $SEARCHTERMS: $!\n" ;
}


_______________________________________________
htdig-general mailing list <htdig-general@lists.sourceforge.net>
To unsubscribe, send a message to <htdig-general-request@lists.sourceforge.net> with a subject of unsubscribe
FAQ: http://htdig.sourceforge.net/FAQ.html