=head1 NAME

OpenDirectory -- Match text/urls against the OpenDirectory /
dmoz.org

=head1 DESCRIPTION

This is a set of classes to match text/urls against the
OpenDirectory / dmoz.org categories.

Matches can currently be performed using a simple word
frequency match (OpenDirectory::Matcher::Word) or by quering
Google (OpenDirectory::Matcher::Google).

The match operation produces generic
OpenDirectory::MatchResult objects containing the matched
OpenDirectory::Category objects (with the match score).

OpenDirectory::MatchResult objects also know how to
structure its contents for a useful presentation, apart
from the basic list.


=head2 Word frequency data

Word frequency data for the OpenDirectoy categories was
gathered by mining sites linked in the OpenDirectory. About
a millon URLs and 10G HTML.


=head2 Class overview

Ignore this module for awhile and look at the POD for the 
following classes instead:

	OpenDirectory::Category

	OpenDirectory::Matcher
	OpenDirectory::Matcher::Word
	OpenDirectory::Matcher::Google

	OpenDirectory::MatchResult

=head1 VERSION

This is alpha software. It works, but it's not properly
refactored from the prototype stage yet. The interface is
likely to change, especially the utility routines in this
module which are mostly leftovers.

=cut





package OpenDirectory;

$VERSION = '0.0.1';





use strict;
use Carp qw( confess );

use Data::Dumper;
use IO;

use LWP::Simple;
use LWP::UserAgent;
use HTTP::Request::Common qw(GET POST);
use HTML::Entities;
use HTML::Parse;

use File::Find;





my $charNoise = quotemeta(join("", map { chr($_) } 0..ord("0")) . q{:;<=>?@[\]^`{|}~€‚ƒ„…†‡ˆ‰‘’“”•–—˜™› ¦§¨©«¬­¯°±²³´µ¶·¸¹º»¼½¾¿£});





=head1 PROPERTIES

=head2 pathData

The path to the dir for storing processed data.

Default: "./";

=cut
sub pathData { my $self = shift; my $pkg = ref($self);
	my ($val) = @_;

	if(defined($val)) {
		$self->{pathData} = $val;
		}

	return($self->{pathData});
	}





=head1 METHODS

=head2 new()

Create new object.

=cut
sub new { my $pkg = shift; $pkg = ref($pkg) || $pkg;

	my $self = {
		};
	bless $self, $pkg;

	$self->pathData(".");


	return($self);
	}





=head2 raDataFileFind([$noMax = 0])

Return array ref with relative file names from the pathData()
directory.

$noMax -- The maximum number of files returned (0 means no
limit).

Return [] on errors.

=cut
sub raDataFileFind { my $self = shift; my $pkg = ref($self);
	my ($noMax) = @_;
	$noMax ||= 0;

	my $no = 0;
	my @aFile;
	eval {
		find( {
				wanted => sub {
					if(/\.txt(\.gz)?$/) {
#					if(/\.txt$/) {
						die() if($noMax && ($no++ >= $noMax));
						push(@aFile, $File::Find::name);
						};
					},
				no_chdir => 1,
				},
			$self->pathData());
		};

	return(\@aFile);
	}





=head2 raUrlFromCategory($topic)

Return array ref with URLs from the category $topic (e.g.
"Top/Computers"). Fetch the categories from http://dmoz.org.

Return undef on errors.

=cut
sub raUrlFromCategory { my $self = shift; my $pkg = ref($self);
	my ($topic) = @_;
	$topic =~ s/^Top//;

	my $url = "http://dmoz.org$topic/";
	my $html = get($url) or return(undef);
	my @aUrl = ($html =~ m{<li><a href="(https?://[^"]+)">}sg);

	return(\@aUrl);
	}





=head2 loadFile($file)

Return contents of $file, or "" on errors.

=cut
sub loadFile {my $self = shift; my $pkg = ref($self);
	my ($file) = @_;

	open(my $fh, "< $file") or return("");
	local $/;
	defined(my $text = <$fh>) or return("");
	close($fh);

	return($text);
	}





=head2 saveFile($file, $text)

Write $text to $file.

Return 1, or 0 on errors.

=cut
sub saveFile {my $self = shift; my $pkg = ref($self);
	my ($file, $text) = @_;

	my $fh;
	open($fh, "> $file") && $fh->print($text) && close($fh) or return(0);

	return(1);
	}





=head2 textTitleSizeFromUrl($url)

Return array with ($text, $title, $size) from $url.

$text -- Plain text rendering (no layout) of the HTML page.

$title -- HTML title

$size -- size in bytes of the HTML page

Return ("", "", 0) on errors, ("", "", -1) on fatal errors.

=cut
sub textTitleSizeFromUrl { my $self = shift; my $pkg = ref($self);
	my ($url) = @_;

	##Get web page
	#Don't use LWP::Simple, it will hang on timeouts
	my $ua = LWP::UserAgent->new();
	$ua->agent("Mozilla/5.0 (compatible; MSIE 5.0; Windows NT/95/98)");
	my $req = GET($url);
	my $res = $ua->request($req);
	return("", "", -1) if(!$res->is_success());
	my $html = $res->content();
	my $size = length($html);

	#Is there no <html> start tag at all? Let's not touch it..
	return("", "", $size) if($html !~ /<\s*html\s*>/is);



	#Extract title
	my $title = "";
	if($html =~ m{<title>(.*?)</title>}si) {
		$title = $1;
		}


	##Extract all text without tags
	my $objParser = parse_html($html);

	my $text = "";
	my @aNodeToDestroy;		#Manually break circular refs so perl can garbage collect properly
	$objParser->traverse(
		sub {
			my($node) = @_;

			if(!ref $node) { #Simple text content
				$text .= encode_entities($node);
				}
			else {
				push(@aNodeToDestroy, $node);	#Collect refs
				}

			return(1);
			}
		);
	$text = decode_entities($text);
	$text =~ s{[\s\n\xA0]+}{ }gs;

	$_->parent(0) for(@aNodeToDestroy);			#Break refs

	return($text, $title, $size);
	}





=head2 rhWordFromText($text, [$rhWord])

Extract words from $text and count them. Add to $rhWord
(key: word, value: count).

$rhWord is both modified in-place and returned.

=cut
sub rhWordFromText { my $self = shift; my $pkg = ref($self);
	my ($text, $rhWord) = @_;
	defined($rhWord) or $rhWord = {};		#Optional, start with empty if not passed

	for my $word (split(/[ ._(),\/]+/, $text)) {
		next if($word =~ /^href=/i);

		$word =~ s{[$charNoise]+}{}gs;				#Remove noise
		next if($word =~ m{^\s*$});					#Skip empty and whitespace only

		$rhWord->{lc($word)}++;
		}

	return($rhWord);
	}





=head2 longestCommonString($first, $second)

Extract words from $text and count them. Add to $rhWord
(key: word, value: count).

$rhWord is both modified in-place and returned.

=cut
sub longestCommonString { my $self = shift; my $pkg = ref($self);
	my ($first, $second) = @_;

	#Shortest string
	my $len = length($first) < length($second) ? length($first) : length($second);

	while(length($first) && $len >= 0 && $first ne $second) {
		$first = substr($first, 0, $len);
		$second = substr($second, 0, $len);
		$len--;
		}

	return($first);
	}





=head1 BUGS

Maybe. Maybe not. Probably.


=head1 AUTHOR

Johan Lindström - johanl@bahnhof.se

Copyright (c) 2002.. Johan Lindström. All rights reserved.
This program is free software; you can redistribute it
and/or modify it under the same terms as Perl itself.

=cut





1;





#EOF
