#!/usr/bin/perl # ice-form.pl -- cgi compliant ICE search interface # # ICE Version 1.4 beta 3 # July 1997 # (C) Christian Neuss (ice-man@next1.isa.informatik.th-darmstadt.de) # MAJOR OVERHAUL BY PETE. CALL IT VERSION 1.4P # NOW USES TEMPLATES # NOW IGNORES REMOVED COMMON WORDS # NOW IGNORES TOO SHORT WORDS # NOW RECOGNISES ABSENCE OF KEYWORDS #--- start of configuration --- put your changes here --- # Title or name of your server: # Example: local($title)="Search this server"; local($title)="Search Blather"; # search directories to present in the search dialogue # Example: # local(@directories)=( # "DZSIM (/www/projects/dzsim)", # "CSCW Laboratory (/www/projects/cscw-lab)", # ); local(@directories)=( 'Archives (/archives/)', 'Archives2 (/archives2/)', 'Archives3 (/archives3/)', 'Archives4 (/archives4/)', 'Articles (/articles/)', 'Articles 2003 (/articles2003/)', 'Shitegeist (/shitegeist/)', 'Forteana (/forteana/)', 'Bookstore (/bookstore/)', 'Gear (/gear/)', 'Consulting (/consulting/)', ); # Location of the indexfile: # Note: under Windows or Windows NT, add the drive letter # Example: $INDEXFILE='/usr/local/etc/httpd/index/index.idx'; $INDEXFILE='/hsphere/local/home/blather/blather.net/search/Index.idx'; # Location of the thesaurus data file: # Example: $thesfile='/igd/a3/home1/neuss/Perl/thes.dat'; #$thesfile='/igd/a3/home1/neuss/Perl/thes.dat'; # Document Root and Aliases for your server. The Document Root is # the directory where the "top level" documents reside. Additional # mappings can be set via the "Aliases" variable (which can be left # empty if no additional mappings exist). # # Important hint: if you are unsure about how to set $docroot, # look at the end of the index file. $docroot must be set so # that it matches the paths found there. # Any occurrence of $docroot in the filepath will be substituted # with a blank. # # Example # $docroot = '/usr3/webstuff/documents'; # %aliases = ( # '/projects', '/usr/stud/proj', # '/people', '/usr3/webstuff/staff', # ); # $docroot = '/hsphere/local/home/blather/blather.net/'; %aliases = ('http://www.blather.net/', '/hsphere/local/home/blather/blather.net/'); # The following configuration settings are OPTIONAL # HEADER and FOOTER make up the template page. # Split the template page where you want the # script to insert forms/results and call the # top 'header' and the bottom 'footer' $HEADER = 'header.txt'; $FOOTER = 'footer.txt'; # Maximum number of hits to return # Example: # $MAXHITS=100; # Minimum length of word to be indexed (same as in ice-idx.pl) # Example: # $MINLEN=3; $MINLEN=3; $TEMPLATE = 'test.html'; ###########--- end of configuration --- you don't have to change anything below ---######### # if this script is called up "by hand", run a test unless($ENV{"SCRIPT_NAME"}){ local($word) = ($#ARGV==-1) ? "test" : $ARGV[0] ; print "You have called the ice forms interface manually.\n"; print "Optionally, provide search word as an argument.\n"; print "Test mode: search for \"$word\"\n"; print "--------\n"; $orig="$word @ /"; $foo=&getquery($orig); print $foo; exit; } # do the real work, but trap any errors eval '&main'; sub main { # if content_length is zero and query string is empty if (($ENV{CONTENT_LENGTH}==0) && (length($ENV{"QUERY_STRING"})==0)){ # we're not decoding a form yet => send the form &send_header("$title"); &send_index(); &send_footer(); return; } # parse forms result and store it in an associative array %forms=&cgiparse(); &send_header("$title: ICE Query Result"); $query = $forms{KEYWORDS}; # remove non-word characters $query = &html2text($query); $query =~ tr/a-zA-Z\xc0-\xff0-9\-/ /cs; # remove leading and trailing whitespace $query =~ s/^\s*(.*\S)\s*/\1/; $pquery = $query; $context = $forms{CONTEXT}; if($context =~ m:\(([^)]*)\):) { $context=$1; }else{ $context=""; } $thesaurus = $forms{THESAURUS}; $substring = $forms{SUBSTRING}; $days = $forms{DAYS}; if(length($days)>0){ $pquery.=" -D $days"; } if(length($thesaurus)>0){ $pquery.=" -T"; } if(length($substring)>0){ $pquery.=" -S"; } if(length($context)>0){ $pquery.=" @ $context"; } if($forms{KEYWORDS}) {($err,$page) = &getquery($pquery);} else { print '
  • Please enter some keywords.'; &send_footer(); return undef; } if($err){ print "Query was: $query
    \n"; print "Problem: $err\n"; &send_footer(); return undef; } # print "Preferences set for this query:\n"; print "\n"; if($page){ print "

    The index contains the following\n"; print "items relevant to the query\n"; print "$page\n"; }else{ print "

    Nothing found.\n"; } &send_footer(); } # if an error has occured, log it to stdout if($@) { &send_header("Error in Script"); # just in case print "$@\n"; &send_footer(); } # print the CGI script header sub send_header { local($title)=@_; print "Content-type: text/html\n\n"; if($HEADER) {$header_dat = `cat $HEADER` || &Error("Couldn't get header $HEADER");} else {$header_dat = "\n$title\n\n\n

    $title

    \n";} print "$header_dat"; } sub send_footer { if($FOOTER) {$footer_dat = `cat $FOOTER` || &Error("Couldn't get footer $FOOTER");} else {$footer_dat = "";} print "$footer_dat"; } # display the Forms interface sub send_index { if($thesfile) { $THESAURUS_INSERT = '
  • Use Thesaurus to extend search
    '; } local($scriptname) = $ENV{"SCRIPT_NAME"}; print "
    \n"; print <<"END"; Find words:

    Tip: You can use "and" and "or" with search terms.
    Example: "cat or mouse".

    Specify options:
    END } # parse data from CGI request and store it as name/value pairs sub cgiparse { if ($ENV{'REQUEST_METHOD'} eq "POST") {read(STDIN, $buffer, $ENV{'CONTENT_LENGTH'});} else {$buffer = $ENV{'QUERY_STRING'};} local(@query_strings) = split("&", $buffer); foreach $q (@query_strings) { $q =~ s/\+/ /g; ($attr, $val) = split("=", $q); $val =~ s/%/\n%/g; local($tmpval); foreach (split("\n",$val)) { if(m:%(\w\w):) { local($binval) = hex($1); if(($binval>0)&&($binval<256)) { local($htmlval) = pack("C",$binval); s/%$1/$htmlval/; } } $tmpval .= $_; } $forms{$attr} = $tmpval; } %forms; } sub getquery { local($query)=@_; local($page)="";} else {$page = "";} ("",$page); # return; } # parse query sub parsequery{ local($query)=@_; local($context,$thesaurus,$substr); # preprocess whitespace and discard spaces after @ and -D $query =~ tr/ \t/ /s; $query =~ s/@ /@/g; $query =~ s/-D /-D/g; $query =~ s/^-D/ -D/g; $_=$query; # "optional URL context as @-sign" if(m:^([^@]*)\s+@(.*)$:){ $context=$2; $_=$1; } while(m:\s+-[SDT]\d*$:){ # "turn on "global" thesaurus" by adding -T" if(m:^(.*)\s+-T$:){ $thesaurus="y"; # print "turn on thesaurus\n"; $_=$1; } # "turn on matching substrings by adding -S" if(m:^(.*)\s+-S$:){ $substr="y"; # print "turn on substring matching\n"; $_=$1; } # "turn on modified since n days" by adding -D" if(m:^(.*)\s+-D(\d+)$:){ $days=$2; # print "turn on modified since $days\n"; $_=$1; } } @list=split(/ /,$_); $expectword="y" unless($days && $#list==-1); foreach $w (@list){ $_ = $w; tr/A-Z/a-z/; if(/^and$/) { if($expectword) {$err="$w"; last;} $expectword="y"; $bool .= "&"; }elsif(/^or$/){ if($expectword) {$err="$w"; last;} $expectword="y"; $bool .= "+"; }else{ unless($expectword) { $bool .= "&"; } $expectword=""; push(@querystring,$w); } } if($expectword){ print ("syntax error in query: must end with keyword!"); } if($err){ print ("syntax error in query near '$err'!"); } return($context,$thesaurus,$substr,$bool,$days,@querystring); } # get index entries matching query sub getindex { local($context,$thes,$substr,$bool,$days,@query) = &parsequery(@_); local(@list,$count,$item,$w,@wordnum,$grepexpr,$ret); local($limit,@allids,@keywords); if($days) {$limit=time()-(60*60*24*$days) unless($days==0);} foreach $item (@query){ ++$count; local($w); $_=$item; local($thesflag)=$thes; if (/{(.*)}/) { $_ = $1; $thesflag="y"; } # convert e.g. "Picture" to "picture" if(/^[A-Z][^A-Z]*$/){ tr/A-Z/a-z/; } # evaluate thesaurus if ($thesflag) { $wordnum{$_}=$count; local(@synonyms)=split(/\n/,&thesread($thesfile,$_)); foreach $w (@synonyms){ push (@keywords,$w); $wordnum{$w}=$count; } } if(length($_)<$MINLEN) { print "ignored: $_ (too short)
    \n"; push(@ignored,$count); }else{ $w=$_; push (@keywords,$w); $wordnum{$w}=$count; } } $grepexpr = join("|",@keywords); # trick for speedup: if no keywords, set grepexpr to "^--" $grepexpr = "^--" unless(@keywords > 0); if(@keywords <=0) {return;} local($timstr,$pat); open(FP,"$INDEXFILE") || die "$!"; while(){ if(/^--/o){ # seperator last; # break loop } next unless (/$grepexpr/o); foreach $w (@keywords){ $pat = $substr ? '\S*'.$w.'\S*' : $w; if(/^($pat)\s+(.*)$/){ $word=$1; @files=split(/ /,$2); foreach (@files){ if(/(.*):(.*)/ || /(.)(.*)/){ $freq=$1; $fileid=$2; } if (length($word)>0) { if (hex($fileid) != 0) { $token=$wordnum{$w}; $entry=join("\n",$fileid,$token,$word,$freq); push(@allids,$fileid); push(@list,$entry); } elsif ($word eq $w) { $token=$wordnum{$w}; print "ignored: $word (stopword)
    \n"; push(@ignored,$token); } } } } } } # step 2: read path information $grepexpr = join("|",@allids); $grepexpr = '\S+' unless (@keywords>0); # match all if none given local($name,$fileid); while(){ if(m:^(/.*):){ $dir = $1; } if(/^($grepexpr)\s+(.*) \/(\S+)\s+(.*)$/o){ $fileid = $1; $name = "$dir/$2"; $mtime = $3; $title = $4; # special case: no keywords -> get all files matching $limit if(@keywords == 0 && $mtime>=$limit){ $entry = join("\n",$name,"","","",$title,$mtime); push(@list,$entry); next; # continue loop } # if file doesn't match $limit if($limit != 0 && $mtime<$limit){ # remove it from list @list = grep(!/^$fileid\n/,@list); } # else replace fileid in @list with real path else{ foreach(@list){ if(/^$fileid\n/){ s/^$fileid\n/$name\n/; # replace id with real path $_ .= "\n$title"; # append title $_ .= "\n$mtime"; # append mod. time } } } } } close(FP); if($context){ # translate virtual<->physical path local($virt,$phys)=($context,&translate($context)); #&Debug("CONTEXT: virt:$virt phys:$phys"); # remove those paths that don't match context @list=grep(/$phys/,@list); } if($#keywords>=0){ # if keywords given evaluate expression @list=sort(@list); &evaluateexpr($bool,@list); #return }else{ # else just reorder foreach $w (@list){ ($path,$token,$word,$freq,$title,$time)=split(/\n/,$w); $w=join("\n","1",$path,$title,&timetostr($time)); } @list; } } sub evaluateexpr { local($bool,@list)=@_; local($lastpath,$lasttitle,$lasttime,$retval); local($relevance,%wordlist); local(@reslist); # let's do some initialisation ($path,$token,$word,$freq,$title,$mtime)=split(/\n/,$list[0]); $lastpath=$path; $lasttitle=$title; $lasttime=$mtime; %wordlist=(); local($fw); if($word ne $query[$token-1]){ $fw="$freq ($query[$token-1])"; }else{ $fw="$freq"; } $wordlist{$word}=$fw; foreach $i (0 .. $#query){ $exprarray[$i]=0; } # all stopwords are ignored (treated as matches) foreach (@ignored) { $exprarray[$_-1]=1; } # loop over all entries in list foreach $i (0 .. $#list+1){ # sic! if($i <= $#list){ ($path,$token,$word,$freq,$title,$mtime)=split(/\n/,$list[$i]); } # if path has changed -> compute expression if(($lastpath ne $path) || ($i==$#list+1)) { $expr=join('',@exprarray); ###print "path changed, call $exprarray ($expr) and ($bool)\n"; local($ret)=&booleval($bool,$expr); if($ret==1){ local($w); $retval .= "$relevance\n"; $retval .= "$lastpath\n"; $retval .= "$lasttitle\n"; $retval .= &timetostr($lasttime)."\n"; foreach $w (sort keys(%wordlist)){ $retval .= "\"$w\" $wordlist{$w}\n"; } push(@reslist,$retval); $retval=""; } if($i==$#list+1){ last; # leave loop } $lastpath=$path; $lasttitle=$title; $lasttime=$mtime; $relevance=$freq; %wordlist=(); if($word ne $query[$token-1]){ $fw="$freq ($query[$token-1])"; }else{ $fw="$freq"; } $wordlist{$word}=$fw; foreach $i (0 .. $#query){ $exprarray[$i]=0; } # all stopwords are ignored (treated as matches) foreach (@ignored) { $exprarray[$_-1]=1; } $exprarray[$token-1]=1; }else{ $exprarray[$token-1]=1; $relevance += $freq; if($word ne $query[$token-1]){ $fw="$freq ($query[$token-1])"; }else{ $fw="$freq"; } $wordlist{$word}=$fw; } } @reslist; # return; } # compute boolean expressions # e.g. to compute "1 or 0 and 1" use booleval("101","+&") sub booleval { local($arg1,$arg2)=@_; local($t1,$t2,$i,$op,$opers,$terms); @opers=split(//,$arg1); @terms=split(//,$arg2); $t1=$terms[0]; if($#terms==0){ # only one term given return $t1; } for $i (0..($#terms-1)){ $t2=$terms[$i+1]; $op=$opers[$i]; if($op eq "+"){ if($t1!=0){ return 1; } else { $t1=$t2; } }else{ $t1*=$t2; } } return $t1; } # evaulate a thesaurus file for a given term sub thesread { local($thesfile,$word)=@_; local($last,$result,$line)=""; local($allowed)="EQ|AB|UF"; unless (open(fpin,$thesfile)) { print STDERR "Cannot open thesaurus file $thesfile\n"; return undef; } while(){ $line++; if (m:^(\S+)\s+$:) { $last=$1; }elsif((m:^\s+($allowed)\s+(\S+):)&&($last eq $word)) { $result .= "$2\n"; } } close(fpin); $result; } # translate URL to physical sub translate { local($url)=@_; local($aliasdone); local($_)=$url; s|/+$||; # strip off a trailing "/" foreach $key (keys(%aliases)){ if( /^$key/ ){ s/^$key/$aliases{$key}/; $aliasdone="y"; #&Debug("replacing $key with $aliases{$key}"); } } if(!$aliasdone && $docroot){ $_ = $docroot.$_; } return $_; } # translate physical to URL sub translateback { local($url)=@_; local($aliasdone); local($_)=$url; s/(.*)\/$/\1/; # strip off a trailing "/" foreach $key (keys(%aliases)){ if(/^$aliases{$key}/){ s/^$aliases{$key}/$key/; $aliasdone="y"; #&Debug("replacing $aliases{$key} with $key"); } } if(!$aliasdone && $docroot) {s/$docroot//;} return $_; } # convert time to string sub timetostr { local($time)=@_; local($sec,$min,$hour,$mday,$mon,$year,$wday,@dontcare)=localtime($time); local($weekday)=(Sun,Mon,Tue,Wed,Thu,Fri,Sat)[$wday]; local($month)=(Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec)[$mon]; local($result)="$weekday $mday $month $year"; $result; } # iso2html - translate iso 8 bit characters to HTML # # Thanks to # Pierre Cormier (cormier.pierre@uqam.ca) # Universite du Quebec Montreal sub initTables { foreach (0..191) { $isohtml[$_] = pack("C",$_);} $isohtml[hex('c0')] = 'À'; $isohtml[hex('c1')] = 'Á'; $isohtml[hex('c2')] = 'Â'; $isohtml[hex('c3')] = 'Ã'; $isohtml[hex('c4')] = 'Ä'; $isohtml[hex('c5')] = 'Å'; $isohtml[hex('c6')] = 'Æ'; $isohtml[hex('c7')] = 'Ç'; $isohtml[hex('c8')] = 'È'; $isohtml[hex('c9')] = 'É'; $isohtml[hex('ca')] = 'Ê'; $isohtml[hex('cb')] = 'Ë'; $isohtml[hex('cc')] = 'Ì'; $isohtml[hex('cd')] = 'Í'; $isohtml[hex('ce')] = 'Î'; $isohtml[hex('cf')] = 'Ï'; $isohtml[hex('d0')] = 'Ð'; $isohtml[hex('d1')] = 'Ñ'; $isohtml[hex('d2')] = 'Ò'; $isohtml[hex('d3')] = 'Ó'; $isohtml[hex('d4')] = 'Ô'; $isohtml[hex('d5')] = 'Õ'; $isohtml[hex('d6')] = 'Ö'; $isohtml[hex('d7')] = '×'; $isohtml[hex('d8')] = 'Ø'; $isohtml[hex('d9')] = 'Ù'; $isohtml[hex('da')] = 'Ú'; $isohtml[hex('db')] = 'Û'; $isohtml[hex('dc')] = 'Ü'; $isohtml[hex('dd')] = 'Ý'; $isohtml[hex('de')] = 'Þ'; $isohtml[hex('df')] = 'ß'; $isohtml[hex('e0')] = 'à'; $isohtml[hex('e1')] = 'á'; $isohtml[hex('e2')] = 'â'; $isohtml[hex('e3')] = 'ã'; $isohtml[hex('e4')] = 'ä'; $isohtml[hex('e5')] = 'å'; $isohtml[hex('e6')] = 'æ'; $isohtml[hex('e7')] = 'ç'; $isohtml[hex('e8')] = 'è'; $isohtml[hex('e9')] = 'é'; $isohtml[hex('ea')] = 'ê'; $isohtml[hex('eb')] = 'ë'; $isohtml[hex('ec')] = 'ì'; $isohtml[hex('ed')] = 'í'; $isohtml[hex('ee')] = 'î'; $isohtml[hex('ef')] = 'ï'; $isohtml[hex('f0')] = 'ð'; $isohtml[hex('f1')] = 'ñ'; $isohtml[hex('f2')] = 'ò'; $isohtml[hex('f3')] = 'ó'; $isohtml[hex('f4')] = 'ô'; $isohtml[hex('f5')] = 'õ'; $isohtml[hex('f6')] = 'ö'; $isohtml[hex('f7')] = '&DIVIS;'; $isohtml[hex('f8')] = 'ø'; $isohtml[hex('f9')] = 'ù'; $isohtml[hex('fa')] = 'ú'; $isohtml[hex('fb')] = 'û'; $isohtml[hex('fc')] = 'ü'; $isohtml[hex('fd')] = 'ý'; $isohtml[hex('fe')] = 'þ'; $isohtml[hex('ff')] = 'ÿ'; # preset iso2text variable settings foreach (0..191) { $iso2text[$_] = pack("C",$_);} foreach (hex('c0')..hex('ff')) {$iso2text[$_] = substr($isohtml[$_],1,1);} # now assign exceptions: $iso2text[hex('c4')] = 'Ae'; $iso2text[hex('c6')] = 'AE'; $iso2text[hex('d0')] = 'ETH'; # ??? $iso2text[hex('d6')] = 'Oe'; $iso2text[hex('d7')] = 'x'; $iso2text[hex('dc')] = 'Ue'; $iso2text[hex('de')] = 'Th'; # thorn ??? $iso2text[hex('df')] = 'sz'; $iso2text[hex('e4')] = 'ae'; $iso2text[hex('e6')] = 'ae'; $iso2text[hex('f7')] = 'D'; # Divis? $iso2text[hex('fc')] = 'ue'; $iso2text[hex('fe')] = 'th'; # thorn # set html2iso variable foreach (1..255) {$html2iso{$isohtml[$_]}=pack("C",$_);;} } sub iso2html { local($input)=@_; unless(defined($isohtml[0])) {&initTables;} local(@car) = split(//,$input); local($output); foreach (@car) {$output .= $isohtml[ord($_)];} $output; } sub iso2text { local($input)=@_; unless(defined($isohtml[0])) {&initTables;} local(@car) = split(//,$input); local($output); foreach (@car) {$output .= $iso2text[ord($_)];} $output; } sub html2iso { local($input)=@_; unless(defined($isohtml[0])) {&initTables;} local(@car) = split(/;/,$input); local($output); foreach (@car) { if(/(.*)&(.*)/) { $output .= $1; $output .= $html2iso{"&$2;"}; } else {$output .= $_;} } $output; } sub html2text { return &iso2text(&html2iso(@_)); } sub Debug { # DUMPS A STRING TO A BROWSER AS HTML if(!$DEBUG_USED) {print "Content-type: text/html\n\n";} $DEBUG_USED = 1; print "
  • $_[0]"; } sub Error { # RETURNS A USER LEVEL ERROR MESSAGE WITH THE SYSTEM HEADER AND FOOTERS print "Content-type: text/html\n\n"; print"

    Error

    $_[0]"; exit; }