#!/usr/bin/perl -w
#
# wwwoffle-extract
#
# Developed by Peter Rockai <yenar@platon.sk>
# Copyright (c) 2003 Platon SDG, http://platon.sk/
# Licensed under terms of GNU General Public License.
# All rights reserved.
#
# Changelog:
# 2002 - created
# 04/08/2003 - support for entering directory on command line
# (by Ondrej Jombik)
#
# $Platon: $
if ($#ARGV != 0) {
printf "Usage: %s <directory>\n", $0;
exit 1;
}
sub in_array
{
my $val = shift;
for (my $i = 0; $i < scalar(@_); $i++) {
return 1 if ($_[$i] eq $val);
}
return 0;
}
$dir = $ARGV[0];
$dir =~ s/\/*$//g;
$out = $dir;
$out =~ s/^.*\/([^\/]+)$/$1/g;
@ls = split (/\n/, ` ls "$dir/"U* `);
@dirs = ();
for $id (@ls) {
$id =~ s/^.*\/U([^\/]+)$/$1/g;
$f = ` cat "$dir/U$id" `;
# strip "protocol://server/" part of URL
$f =~ s!^(.*?//.*?/)!!;
$server = $1;
print "Extracting $f\n";
# create directory tree
$newdir = $f;
$newdir =~ s/^(.*\/)[^\/]*$/$1/g;
if (! in_array($newdir, @dirs)) {
print "Creating $out/$newdir\n";
push @dirs, $newdir if (! system("mkdir -p '$out/$newdir'"));
}
$data = ` cat "$dir/D$id" `;
# strip HTTP header
$data =~ s/^.*?\n\x0d?\n\x0d?//ms;
$td = "/$newdir/";
1 while $td =~ s!(?<=/)[^.]+?(?=/)!..!g;
$td =~ s/\n$//;
$td =~ s!^/!!;
$td =~ s/^\/$//;
# convert URL's
$data =~ s!$server!$td!g;
# remove remote images
$data =~ s!<[iI][mM][gG](.*?)[sS][rR][cC]="?.*?://"?.*?>!!g;
# strip comments
$data =~ s@<!--.*?-->@@gsm;
#print "converting URL's ('$server' -> '$td')\n";
# output data
$f = "$out/$f";
open (OUTF, ">$f");
print OUTF $data;
close OUTF;
}
Platon Group <platon@platon.org> http://platon.org/
|