#!/usr/bin/perl -w
package SwishSearch;
use strict;
# This is set to where Swish-e's "make install" installed the helper modules.
use lib ( '/usr/local/lib/swish-e/perl' );
my $DEFAULT_CONFIG_FILE = '.swishcgi.conf';
###################################################################################
#
# If this text is displayed on your browser then your web server
# is not configured to run .cgi programs. Contact your web server administrator.
#
# To display documentation for this program type "perldoc swish.cgi"
#
# swish.cgi $Revision$ Copyright (C) 2001 Bill Moseley swishscript@hank.org
# Example CGI program for searching with SWISH-E
#
# This example program will only run under an OS that supports fork().
# Under windows it uses a piped open which MAY NOT BE SECURE.
#
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version
# 2 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# The above lines must remain at the top of this program
#
# $Id$
#
####################################################################################
# This is written this way so the script can be used as a CGI script or a mod_perl
# module without any code changes.
# use CGI (); # might not be needed if using Apache::Request
#=================================================================================
# CGI entry point
#
#=================================================================================
# Package global so the merged configuration survives between requests when
# the interpreter is persistent (e.g. SpeedyCGI).
use vars '$speedy_config';

# Plain CGI entry point.  Under mod_perl nothing runs here; requests enter
# through handler() instead.
if ( !$ENV{MOD_PERL} ) {

    # Build the configuration only on the first pass: the built-in defaults
    # are stored first, then replaced by the result of merging in the
    # on-disk config file.
    $speedy_config = merge_read_config( $speedy_config = default_config() )
        unless $speedy_config;

    process_request( $speedy_config );
}
#==================================================================================
# This sets the default configuration parameters
#
# Any configuration read from disk is merged with these settings.
#
# Only a few settings are actually required. Some reasonable defaults are used
# for most. In fact, you can probably create a complete config as:
#
# return {
# swish_binary => '/usr/local/bin/swish-e',
# swish_index => '/usr/local/share/swish/index.swish-e',
# title_property => 'swishtitle', # Not required, but recommended
# };
#
# But, that doesn't really show all the options.
#
# You can modify the options below, or you can use a config file. The config file
# is .swishcgi.conf by default (read from the current directory) that must return
# a hash reference. For example, to create a config file that changes the default
# title and index file name, plus uses Template::Toolkit to generate output
# create a config file as:
#
# # Example config file -- returns a hash reference
# return {
# title => 'Search Our Site',
# swish_index => 'index.web',
#
# template => {
# package => 'SWISH::TemplateToolkit',
# file => 'swish.tt',
# options => {
# INCLUDE_PATH => '/home/user/swish-e/example',
# },
# },
# };
#
#
#-----------------------------------------------------------------------------------
sub default_config {
# Returns a new hash reference holding the built-in default configuration.
# The CGI entry point and the mod_perl handler() pass this hash through
# merge_read_config(), which overlays any settings read from the config file.
##### Configuration Parameters #########
#---- This lists all the options, with many commented out ---
# By default, this config is used -- see the CGI entry point above and the
# mod_perl handler() below.
# You should adjust for your site, and how your swish index was created.
##>>
##>> Please don't post this entire section on the swish-e list if looking for help!
##>>
##>> Send a small example, without all the comments.
#======================================================================
# *** NOTES ****
# Items beginning with an "x" (or "X") or "#" are commented out
# the "x" form simply renames (hides) that setting. It's used
# to make it easy to disable a multi-line configuration setting.
#
# If you do not understand a setting then best to leave the default.
#
# Please follow the documentation (perldoc swish.cgi) and set up
# a test using the defaults before making changes. It's much easier
# to modify a working example than to try to get a modified example to work...
#
# Again, this is a Perl hash structure. Commas are important.
#======================================================================
return {
title => 'Search our site', # Title of your choice. Displays on the search page
swish_binary => '/usr/local/bin/swish-e', # Location of swish-e binary
# By default, this script tries to read a config file. You should probably
# comment this out if not used, to save a disk stat
config_file => $DEFAULT_CONFIG_FILE, # Default config file
# The location of your index file. Typically, this would not be in
# your web tree.
# If you have more than one index to search then specify an array
# reference. e.g. swish_index =>[ qw( index1 index2 index3 )],
swish_index => 'index.swish-e', # Location of your index file
# See "select_indexes" below for how to
# select more than one index.
page_size => 15, # Number of results per page - default 15
# prepend this path to the filename (swishdocpath) returned by swish. This is used to
# make the href link back to the original document. Comment out to disable.
#prepend_path => 'http://localhost/mydocs',
# This is the property that is used for the href link back to the original
# document. It's "swishdocpath" by default
#link_property => 'swishdocpath',
## Display properties ##
# Everything swish records about a file is called a "property". These
# next three settings tell the swish.cgi script which properties should be passed
# to the templating code for output generation.
# First is the property name to use as the main link text to the indexed document.
# Typically, this will be 'swishtitle' if you have indexed html documents,
# but you can specify any PropertyName defined in your document.
# By default, swish will display the pathname for documents that do not
# have a title.
# In other words, this is used for the text of the links of the search results.
# title_property
title_property => 'swishtitle',
# Swish has a configuration directive "StoreDescription" that will save part or
# all of a document's contents in the index file. This can then be displayed
# along with results. If you are indexing a lot of files this can use a lot of disk
# space, so test carefully before indexing your entire site.
# Building swish with zlib can greatly reduce the space used by StoreDescription.
#
# This setting tells this script to display this property as the description.
# Normally, this should be 'swishdescription', but you can specify another property name.
# There is no default.
description_prop => 'swishdescription',
# Property names listed here will be displayed in a table below each result
# You may wish to modify this list if you are using document properties (PropertyNames)
# in your swish-e index configuration
# There is no default.
display_props => [qw/swishlastmodified swishdocsize swishdocpath/],
# Results can be sorted by any of the properties listed here
# They will be displayed in a drop-down list on the form.
# You may modify this list if you are using document properties of your own creation
# Swish uses the rank as the default sort
sorts => [qw/swishrank swishlastmodified swishtitle swishdocpath/],
# Secondary_sort is used to sort within a sort
# You may enter a property name followed by a direction (asc|desc)
secondary_sort => [qw/swishlastmodified desc/],
# You can limit by MetaNames here. Names listed here will be displayed in
# a line of radio buttons.
# The default is to not allow any metaname selection.
# To use this feature you must define MetaNames while indexing.
# The special "swishdefault" says to search any text that was not indexed
# as a specific metaname (e.g. typically the body of a HTML document and its title).
# To see how this might work, add to your *swish-e* config file:
# MetaNames swishtitle swishdocpath
# reindex and try:
metanames => [qw/ swishdefault swishtitle swishdocpath /],
# Add "all" to this list to test the meta_groups feature described below
# Another example: if you indexed an email archive
# that defined the metanames subject name email (as in the swish-e discussion archive)
# you might use:
#metanames => [qw/body subject name email/],
# Searching multiple meta names:
# You can also group metanames into "meta-metanames".
# Example: Say you defined metanames "author", "comment" and "keywords"
# You want to allow searching "author", "comment" and the document body ("swishdefault")
# But you would also like an "all" search that searches all metanames, including "keywords":
#
# metanames => [qw/swishdefault author comment all/],
#
# Now, the "all" metaname is not a real metaname. It must be expanded into its
# individual metanames using meta_groups:
#
# "meta_groups" maps a fake metaname to a list of real metanames
#
# meta_groups => {
# all => [qw/swishdefault author comment keywords / ],
# },
#
# swish.cgi will then take a query like
#
# all=(query words)
#
# and create the query
#
# swishdefault=(query words) OR author=(query words) OR comment=(query words) OR keywords=(query words)
#
# This is not ideal, but should work for most cases
# (might fail under windows since the query is passed through the shell).
# To enable this group add "all" to the list of metanames above
meta_groups => {
all => [qw/swishdefault swishtitle swishdocpath/],
},
# Note that you can use other words than "all". The script just checks if a given metaname is
# listed in "meta_groups" and expands as needed.
# "name_labels" is used to map MetaNames and PropertyNames to user-friendly names
# on the CGI form.
name_labels => {
swishdefault => 'Title & Body',
swishtitle => 'Title',
swishrank => 'Rank',
swishlastmodified => 'Last Modified Date',
swishdocpath => 'Document Path',
swishdocsize => 'Document Size',
all => 'All', # group of metanames
subject => 'Message Subject', # other examples
name => "Poster's Name",
email => "Poster's Email",
sent => 'Message Date',
},
timeout => 10, # limit time used by swish when fetching results - DoS protection.
# does not work under Windows
max_query_length => 100, # limit length of query string. Swish also has a limit (default is 40)
# You might want to set swish-e's limit higher, and use this to get a
# somewhat more friendly message.
max_chars => 500, # Limits the size of the description_prop if it is not highlighted
# This structure defines term highlighting, and what type of highlighting to use
# If you are using metanames in your searches and they map to properties that you
# will display, you may need to adjust the "meta_to_prop_map".
highlight => {
# Pick highlighting module -- you must make sure the module can be found
# The highlighting modules are in the example/modules directory by default
# Ok speed, but doesn't handle phrases or stopwords
# Deals with stemming, and shows words in context
# Takes into consideration WordCharacters, IgnoreFirstChars and IgnoreLastChars.
#package => 'SWISH::DefaultHighlight',
# Somewhat slow, but deals with phrases, stopwords, and stemming.
# Takes into consideration WordCharacters, IgnoreFirstChars and IgnoreLastChars.
package => 'SWISH::PhraseHighlight',
# Faster: phrases without regard to wordcharacter settings
# doesn't do context display, so must match in first X words, so may not even highlight
# doesn't handle stemming or stopwords.
#package => 'SWISH::SimpleHighlight',
show_words => 10, # Number of "swish words" words to show around highlighted word
max_words => 100, # If no words are found to highlight then show this many words
occurrences => 6, # Limit number of occurrences of highlighted words
# NOTE(review): both markers below are empty strings in this copy of the file;
# the HTML tags may have been lost at some point -- confirm the intended values.
highlight_on => '', # HTML highlighting codes
highlight_off => '',
#highlight_on => '',
#highlight_off => '',
# This maps (real) search metatags to display properties.
# e.g. if searching in "swishdefault" then highlight in the
# swishtitle and swishdescription properties
# Do not include "fake" metanames defined with meta_groups, just
# list the real metanames used in your index, and the properties they
# relate to.
meta_to_prop_map => {
swishdefault => [ qw/swishtitle swishdescription/ ],
swishtitle => [ qw/swishtitle/ ],
swishdocpath => [ qw/swishdocpath/ ],
},
},
# If you specify more than one index file (as an array reference) you
# can set this to allow selection of which indexes to search.
# The default is to search all indexes specified if this is not used.
# When used, the first index is the default index.
# You need to specify your indexes as an array reference:
#swish_index => [ qw/ index.swish-e index.other index2.other index3.other index4.other / ],
# (Disabled here by the leading "X" -- rename the key to "select_indexes" to enable.)
Xselect_indexes => {
# pick radio_group, popup_menu, or checkbox_group
method => 'checkbox_group',
#method => 'radio_group',
#method => 'popup_menu',
columns => 3,
# labels must match up one-to-one with elements in "swish_index"
labels => [ 'Main Index', 'Other Index', qw/ two three four/ ],
description => 'Select Site: ',
# Optional - Set the default index if none is selected
# This needs to be an index file name listed in swish_index
# above, not a label
default_index => '',
},
# Similar to select_indexes, this adds a metaname search
# based on a metaname. You can use any metaname, and this will
# add an "AND" search to limit results to a subset of your records.
# i.e. it adds something like 'site=(foo or bar or baz)' if foo, bar, and baz were selected.
# This really just allows you to limit existing searches by a metaname, instead of
# selecting a metaname (with metanames option above).
# Swish-e's ExtractPath would work well with this. For example,
# to allow limiting searches to specific sections of the apache docs use this
# in your swish-e config file:
# ExtractPath site regex !^/usr/local/apache/htdocs/manual/([^/]+)/.+$!$1!
# ExtractPathDefault site other
# which extracts the segment of the path after /manual/ and indexes that name
# under the metaname "site". Then searches can be limited to files with that
# path (e.g. query would be swishdefault=foo AND site=vhosts to limit searches
# to the virtual host section).
# (Disabled here by the leading "X" -- rename the key to "select_by_meta" to enable.)
Xselect_by_meta => {
#method => 'radio_group', # pick: radio_group, popup_menu, or checkbox_group
method => 'checkbox_group',
#method => 'popup_menu',
columns => 3,
metaname => 'site', # Can't be a metaname used elsewhere!
values => [qw/misc mod vhosts other/],
labels => {
misc => 'General Apache docs',
mod => 'Apache Modules',
vhosts => 'Virtual hosts',
},
description => 'Limit search to these areas: ',
},
# The 'template' setting defines what generates the output
# The default is "TemplateDefault" which is reasonably ugly,
# but does not require installation of a separate templating system.
# Note that some of the above options may not be available
# for templating, as it's up to you to layout the form
# and swish-e results in your template.
# TemplateDefault is the default
# All four examples below use the hidden key "xtemplate", so none of them is
# active; rename exactly one of them to "template" to select that output module.
xtemplate => {
package => 'SWISH::TemplateDefault',
},
xtemplate => {
package => 'SWISH::TemplateDumper',
},
xtemplate => {
package => 'SWISH::TemplateToolkit',
file => 'swish.tt',
options => {
INCLUDE_PATH => '/usr/local/share/swish-e',
#PRE_PROCESS => 'config',
},
},
xtemplate => {
package => 'SWISH::TemplateHTMLTemplate',
options => {
filename => 'swish.tmpl',
path => '/usr/local/share/swish-e',
die_on_bad_params => 0,
loop_context_vars => 1,
cache => 1,
},
},
# The "on_intranet" setting is just a flag that can be used to say you do
# not have an external internet connection. It's here because the default
# page generation includes links to images on swish-e.org and on www.w3.org.
# If this is set to one then those images will not be shown.
# (This only affects the default output module SWISH::TemplateDefault)
on_intranet => 0,
# Here you can hard-code debugging options. They will help you find
# where you made your mistake ;)
# Using all at once will generate a lot of messages to STDERR
# Please see the documentation before using these.
# Typically, you will set these from the command line instead of in the configuration.
# debug_options => 'basic, command, headers, output, summary, dump',
# This defines the package object for reading CGI parameters
# Defaults to CGI. Might be useful with mod_perl.
# request_package => 'CGI',
# request_package => 'Apache::Request',
# use_library => 1, # set true and will use the SWISH::API module
# will cache based on index files when running under mod_perl
# Minor adjustment to page display. The page navigation normally looks like:
# Page: 1 5 6 7 8 9 24
# where the first page and last page are always displayed. These can be disabled
# by setting to true values ( 1 )
no_first_page_navigation => 0,
no_last_page_navigation => 0,
num_pages_to_show => 12, # number of pages to offer
# Limit to date ranges
# This adds in the date_range limiting options
# You will need the DateRanges.pm module from the author to use that feature
# Normally, you will want to limit by the last modified date, so specify
# "swishlastmodified" as the property_name. If indexing a mail archive, and, for
# example, you store the date (a unix timestamp) as "date" then specify
# "date" as the property_name.
date_ranges => {
property_name => 'swishlastmodified', # property name to limit by
# what you specify here depends on the DateRanges.pm module.
time_periods => [
'All',
'Today',
'Yesterday',
#'Yesterday onward',
'This Week',
'Last Week',
'Last 90 Days',
'This Month',
'Last Month',
#'Past',
#'Future',
#'Next 30 Days',
],
line_break => 0,
default => 'All',
date_range => 1,
},
# This is supposed to reduce the load on systems if hit with a large number
# of requests. Although this will limit the number of swish-e processes run
# it will not limit the number of CGI requests. I feel like a better solution
# is to use mod_perl (with the SWISH::API module).
# I also think that running /bin/ps for every request is not ideal.
# This only works on unix-based systems when running the swish-e binary.
# It greps /swish-e/ from the output of ps and aborts if the count is > limit_procs
# Set max number of swish-e binaries and ps command to run
limit_procs => 0, # max number of swish process to run (zero to not limit)
ps_prog => '/bin/ps -Unobody -ocommand', # command to list number of swish binaries
};
}
#^^^^^^^^^^^^^^^^^^^^^^^^^ end of user config ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#========================================================================================
#=================================================================================
# mod_perl entry point
#
# As an example, you might use a PerlSetVar to point to paths to different
# config files, and then cache the different configurations by path.
#
#=================================================================================
# Merged configurations keyed by the Swish_Conf_File path they were read from.
# Entries are never expired, so a changed config file is only picked up after
# this hash is re-created (i.e. a server restart).
my %cached_configs;

# mod_perl content handler.
#
#   $r  - request object; only its dir_config() method is used here, to look
#         up the optional "Swish_Conf_File" PerlSetVar.
#
# When Swish_Conf_File is set, the defaults are merged with that file once and
# the result is cached by path.  Otherwise the defaults are merged with the
# default config file on every request.  Either way the merged configuration
# is what gets passed to process_request().
#
# Returns Apache::Constants::OK().  Errors raised by merge_read_config() or
# process_request() propagate to the caller.
sub handler {
    my $r = shift;

    my $config;

    if ( my $config_path = $r->dir_config( 'Swish_Conf_File' ) ) {

        # Already cached?
        $config = $cached_configs{ $config_path };

        unless ( $config ) {
            # Not yet: load the defaults, point them at the requested file,
            # merge, and remember the result for subsequent requests.
            $config = default_config();
            $config->{config_file} = $config_path;
            $config = $cached_configs{ $config_path } = merge_read_config( $config );
        }
    }
    else {
        # No per-directory config file: use the hard-coded config merged with
        # the default disk config file.  The merged result (not a fresh
        # default_config()) must be what is used for the request.
        $config = merge_read_config( default_config() );
    }

    process_request( $config );
    return Apache::Constants::OK();
}
#============================================================================
# Read config settings from disk, and merge
# Note: a missing *default* config file is silently ignored, since by default
# this script always looks for one. Any other failure to load a config file is fatal.
#
#============================================================================
# Overlay the settings from the on-disk config file onto $config.
#
#   $config - hash reference of settings; $config->{config_file} names the
#             file to load (if false, $config is returned as-is).
#
# The config file is Perl code run with "do" and must return a hash reference.
# Debug flags are (re)computed for both the passed-in config and the loaded
# one via set_debug().
#
# Returns a new hash reference { %$config, %$loaded } (keys from the file win),
# or $config itself when there is no file to load or when the *default* config
# file simply does not exist.  Dies for any other load failure.
sub merge_read_config {
    my $config = shift;

    set_default_debug_flags();
    set_debug( $config );    # get from config or from %ENV

    my $file = $config->{config_file};
    return $config unless $file;

    # "do FILE" looks a bare relative name (such as ".swishcgi.conf") up in
    # @INC, and "." is no longer part of @INC as of Perl 5.26 -- so a config
    # file in the current directory would never be found.  If the file exists
    # relative to the current directory, hand "do" an absolute path so it is
    # loaded from there; otherwise keep the plain name and the @INC search.
    my $load_path = $file;
    if ( -e $file ) {
        require File::Spec;
        $load_path = File::Spec->rel2abs( $file );
    }

    my $return = do $load_path;    # load the config file

    unless ( ref $return eq 'HASH' ) {

        # Capture the reason now -- the -e test below may overwrite $!.
        # $! is only meaningful when "do" returned undef; a file that ran but
        # returned something else gets an explicit message instead.
        my $error = $@
            || ( defined $return ? '' : $! )
            || 'file did not return a hash reference';

        # File not found for the default config is not an error.
        if ( $file eq $DEFAULT_CONFIG_FILE && !-e $file ) {
            warn "Config file '$file': $!" if $config->{debug};
            return $config;
        }

        die "Config file '$file': $error";
    }

    if ( $config->{debug} || $return->{debug} ) {
        require Data::Dumper;
        print STDERR "\n---------- Read config parameters from '$file' ------\n",
            Data::Dumper::Dumper($return),
            "-------------------------\n";
    }

    set_debug( $return );

    # Merge settings
    return { %$config, %$return };
}
#--------------------------------------------------------------------------------------------------
# Initialise the $SwishSearch::DEBUG_* package globals.  Each one is a distinct
# bit so that several debug options can be OR-ed together into one value.
sub set_default_debug_flags {
    $SwishSearch::DEBUG_BASIC     = 1 << 0;    #  1: basic debugging
    $SwishSearch::DEBUG_COMMAND   = 1 << 1;    #  2: show command used to run swish
    $SwishSearch::DEBUG_HEADERS   = 1 << 2;    #  4: swish output headers
    $SwishSearch::DEBUG_OUTPUT    = 1 << 3;    #  8: swish output besides headers
    $SwishSearch::DEBUG_SUMMARY   = 1 << 4;    # 16: summary of results parsed
    $SwishSearch::DEBUG_RESULTS   = 1 << 5;    # 32: detail of results parsed
    $SwishSearch::DEBUG_DUMP_DATA = 1 << 6;    # 64: dump data that is sent to templating modules
}
#---------------------------------------------------------------------------------------------------
# Compute $conf->{debug} from a comma-separated list of debug option names.
#
#   $conf - configuration hash reference; its {debug} entry is always reset.
#
# The list comes from $ENV{SWISH_DEBUG} if that is true, otherwise from
# $conf->{debug_options}.  With no list, {debug} is left at 0.  Otherwise
# {debug} starts at 1 and the flag of every named option is OR-ed in.
# An unrecognised option prints the list of valid options to STDERR and exits.
sub set_debug {
    my $conf = shift;

    $conf->{debug} = 0;

    my $requested = $ENV{SWISH_DEBUG} || $conf->{debug_options};
    return unless $requested;

    # option name => [ flag bit, description ]
    my %option_for = (
        basic   => [ $SwishSearch::DEBUG_BASIC,     'Basic debugging' ],
        command => [ $SwishSearch::DEBUG_COMMAND,   'Show command used to run swish' ],
        headers => [ $SwishSearch::DEBUG_HEADERS,   'Show headers returned from swish' ],
        output  => [ $SwishSearch::DEBUG_OUTPUT,    'Show output from swish' ],
        summary => [ $SwishSearch::DEBUG_SUMMARY,   'Show summary of results' ],
        results => [ $SwishSearch::DEBUG_RESULTS,   'Show detail of results' ],
        dump    => [ $SwishSearch::DEBUG_DUMP_DATA, 'Show all data available to templates' ],
    );

    $conf->{debug} = 1;

    my @enabled;

    for my $raw_option ( split /\s*,\s*/, $requested ) {
        my $option = lc $raw_option;

        # Unknown name: list the valid options (ordered by flag value) and stop.
        unless ( exists $option_for{$option} ) {
            my @help =
                map  { sprintf(' %10s: %10s', $_, $option_for{$_}->[1]) }
                sort { $option_for{$a}->[0] <=> $option_for{$b}->[0] }
                keys %option_for;

            print STDERR "Unknown debug option '$raw_option'. Must be one of:\n",
                join( "\n", @help ),
                "\n\n";
            exit;
        }

        push @enabled, $option;
        $conf->{debug} |= $option_for{$option}->[0];
    }

    print STDERR "Debug level set to: $conf->{debug} [", join( ', ', @enabled), "]\n";
}
#============================================================================
#
# This is the main controller (entry point), where a config hash is passed in.
#
# Loads the request module (e.g. CGI.pm), and the output module
# Also sets up debugging
#
#============================================================================
# Main controller: handles one search request using the given configuration.
#
#   $conf - configuration hash reference (defaults merged with any config file).
#           Its {link_property} entry is filled in here if it is not set.
#
# Loads the request module and the output (template) module, builds and runs
# the query, and hands the result to the template's show_template().
sub process_request {
    my $conf = shift;

    # Limit number of requests - questionable value.  Only applies when the
    # swish-e binary is used and a numeric, non-zero limit plus a ps command
    # are configured.
    if (   !$conf->{use_library}
        && $conf->{limit_procs}
        && $conf->{limit_procs} =~ /^\d+$/
        && $conf->{ps_prog} )
    {
        limit_swish( $conf->{limit_procs}, $conf->{ps_prog} );
    }

    # Property used for the href link to the document, unless configured.
    $conf->{link_property} = 'swishdocpath' unless $conf->{link_property};

    # Request object: CGI.pm unless another package is configured.
    my $request_package = $conf->{request_package} || 'CGI';
    load_module( $request_package );
    my $request_object = $request_package->new;

    # Output module: SWISH::TemplateDefault unless a template is configured.
    my $template = $conf->{template} || { package => 'SWISH::TemplateDefault' };
    load_module( $template->{package} );

    # Let the config file adjust the request before the query is built.
    my $fixup = $conf->{request_fixup};
    if ( $fixup && ref $fixup eq 'CODE' ) {
        $fixup->( $request_object, $conf );
    }

    # Debugging outside of a web server (no GATEWAY_INTERFACE set).
    if ( $conf->{debug} && !$ENV{GATEWAY_INTERFACE} ) {
        set_debug_input( $conf, $request_object );
    }

    # Create search object and build a query based on the request parameters.
    my $search = SwishQuery->new(
        config  => $conf,
        request => $request_object,
    );

    # Run the query; the results live in the $search object itself.
    $search->run_query;

    # Navigation links are only needed when something was found.
    $search->set_navigation if $search->hits;

    show_debug_output( $conf, $search ) if $conf->{debug};

    return $template->{package}->show_template( $template, $search );
}
# For limiting number of swish-e binaries
sub limit_swish {
my ( $limit_procs, $ps_prog ) = @_;
my $num_procs = scalar grep { /swish-e/ } `$ps_prog`;
return if $num_procs <= $limit_procs;
warn "swish.cgi - limited due to too many currently running swish-e binaries: $num_procs running is more than $limit_procs\n";
## Abort
print <
Too Many Requests
Too Many Requests -- Try again later