+++ /dev/null
-#!/usr/bin/env perl
-use strict;
-use warnings;
-#use LWP::Simple qw/get/;
-use NB::Functions qw/html2txt str_trim/;
-#################################################################################
-#
-# VERSION
-#
-#################################################################################
-my $VERSION = '0.0.1';
-# NB 25.01.17
-# - create script: html-table2csv
-
-#################################################################################
-#
-# GLOBALS
-#
-#################################################################################
-my ($NAME) = $0 =~ m,([^/]+)$,;
-
-#################################################################################
-#
-# ARGS
-#
-#################################################################################
-my $VERBOSE = $main::VERBOSE = 1;
-my $DEBUG = $main::DEBUG = 0;
-
-my %Opt = (
- 'nume' => undef,
- 'tag' => 'table',
- 'sep' => "\t",
- 'html' => 0,
-);
-get_options(\%Opt);
-help() unless @ARGV;
-$main::_DATA_ = undef;
-
-#################################################################################
-#
-# BEGIN
-#
-#################################################################################
-@ARGV = map {m,^\w+://, ? "curl -s '$_' |" : $_} @ARGV if @ARGV;
-my $html = join('',<>);
-$html =~ s/[\r\n]+//g;
-$html =~ s/\s+/ /g;
-
-my %TAGS = (
- 'table' => [
- 'table',
- 'tr',
- 't[dh]',
- ],
- 'dl' => [
- 'dl',
- '',
- 'd[td]',
- ],
-);
-my ($T1,$T2,$T3) = @{ $TAGS{$Opt{tag}} };
-my $table_num = 0;
-
-for my $table ($html =~ m,<${T1}[^>]*>(.*?)<\s*/\s*${T1}\s*>,gi) {
- $table_num++;
- #warn $table_num;
- next if defined $Opt{'num'} and $Opt{num} and $Opt{num} != $table_num;
-
- $table = "<>$table</>" unless $T2;
- for my $tr ($table =~ m,<${T2}[^>]*>(.*?)<\s*/\s*${T2}\s*>,gi) {
- my @col;
-
- for my $td ($tr =~ m,<${T3}[^>]*>(.*?)<\s*/\s*${T3}\s*>,gi) {
- $td = html2txt($td) unless $Opt{html};
- $_ = chr(194).chr(160); $td =~ s/$_/ /g;
- $td =~ s/\s+/ /g;
- $td = str_trim($td);
- push(@col,$td);
- }
-
- print join($Opt{sep},@col)."\n";
-
- }
-
-}
-
-#################################################################################
-#
-# END
-#
-#################################################################################
-exit 0;
-
-#################################################################################
-#
-# Functions
-#
-#################################################################################
-sub help {
-#------------------------------------------------------------------------------
-# Print help and exit
-#------------------------------------------------------------------------------
-
- require 'Pod/Usage.pm' unless $INC{'Pod/Usage.pm'};
- require 'Pod/Perldoc.pm' unless $INC{'Pod/Perldoc.pm'};
-
- # Substitutions
- sub pod_env {
- my $v = '';
- eval '$v = ref(\\'.$_[0].') eq "ARRAY" ? join(" ",'.$_[0].') : '.$_[0].'; return defined $v ? $v : qq|UNDEF|;';
- return $v;
- }
-
- $main::_DATA_ =~ s/([@\$][A-Z_a-z\{\}]+)/pod_env($1)/eg;
-
- # Create tmp
- my $in_file = (-e '/dev/shm' ? '/dev/shm' : '/tmp')."/$NAME.$$";
- my $in;
- open($in,">$in_file") or die "$NAME: Can't write into $in_file: $!";
- print $in $main::_DATA_;
- close $in;
-
- # Output
- open(STDOUT,"|perl -pe 's/\.$$//g'".(($ENV{PAGER}||'') eq 'less' ? "|less -FRi" : ""));
- my $opts = {
- -input => $in_file,
- -ouput => \*STDOUT,
- -exitval => 'noexit',
- -sections => [qw(SYNOPSIS DESCRIPTION OPTIONS)],
- -verbose => ($Opt{'help'} ? 99 : 3),
- };
-
- Pod::Usage::pod2usage($opts);
- close STDOUT;
- unlink $in_file if $in_file and -e $in_file;
-
- exit 0;
-}
-
-#------------------------------------------------------------------------------
-# Print version and exit
-#------------------------------------------------------------------------------
-sub version { print "$NAME: version [$VERSION]\n"; exit 0; }
-
-#------------------------------------------------------------------------------
-# Get options from pod
-#------------------------------------------------------------------------------
-sub get_options {
-
- use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
-
- my @Opt;
-
- sub pod_opt {
- local $_;
- my $o = shift;
- $o =~ s/(=.|[\+\-\!]$)//;
- $o = join(", ",map{"-$_"} split(/[\|,:;]/,$o));
- return "$o";
- }
-
- while (<DATA>) {
- s/option\[([^\]]+)\]/push(@Opt,$1) and pod_opt($1)/eg;
- $main::_DATA_ .= $_;
- }
-
- GetOptions($_[0],@Opt) || exit -1;
-
- help() if $_[0]{'help'} or $_[0]{'man'};
- version() if $_[0]{'version'};
-
- $main::VERBOSE = $VERBOSE = $_[0]{'verbose'} if defined $_[0]{'verbose'};
- $main::DEBUG = $DEBUG = $_[0]{'debug'} if defined $_[0]{'debug'};
-
-}
-
-__DATA__
-
-=head1 NAME
-
-$NAME - Script to extract html table into csv
-
-=head1 SYNOPSIS
-
-Quick usage:
-
-=over
-
-=item $NAME --verbose
-
-=item $NAME --help
-
-=back
-
-=head1 OPTIONS
-
- -option[tag|T=s] Default: table
- -option[sep|s=s] Default: tab
- -option[html!] Escape html (default: yes)
- -option[num|n=i] Only dump table number
- -option[verbose|v+] Verbose mode: increase the verbosity level.
- -option[debug+] Debug mode: increase the verbosity level.
- -option[version|V] Print version (default: $VERSION)
- -option[help|h|?] Print a brief help message and exits.
- -option[man] Print the manual page and exits.
-
-=cut
-
-=head1 COPYRIGHT AND LICENSE
-
-Copyright (C) 2017 Nicolas Boisselier
-
-This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
-
-See <http://www.gnu.org/licenses/>.
-
-=head1 AUTHOR
-
-Nicolas Boisselier <nicolas.boisselier@gmail.com>
-
-=cut
--- /dev/null
+#!/usr/bin/env perl
+use strict;
+use warnings;
+#use LWP::Simple qw/get/;
+use NB::Functions qw/html2txt str_trim/;
+#################################################################################
+#
+# VERSION
+#
+#################################################################################
+my $VERSION = '0.0.1';
+# NB 25.01.17
+# - create script: html-table2csv
+
+#################################################################################
+#
+# GLOBALS
+#
+#################################################################################
+# Last path component of $0; used as a prefix in messages and temp file names.
+my ($NAME) = ($0 =~ m,([^/]+)$,);
+
+#################################################################################
+#
+# ARGS
+#
+#################################################################################
+# Verbosity / debug levels, mirrored into the package variables of the same name.
+my $VERBOSE = 1;
+$main::VERBOSE = $VERBOSE;
+my $DEBUG = 0;
+$main::DEBUG = $DEBUG;
+
+# Option defaults; get_options() overlays whatever the command line provides.
+my %Opt = (
+    num  => undef,
+    tag  => 'table',
+    exp1 => '',
+    exp2 => '',
+    exp3 => '',
+    sep  => "\t",
+    html => 0,
+);
+get_options(\%Opt);
+
+# At least one file or URL argument is required; otherwise show the help.
+@ARGV or help();
+
+# The POD text collected by get_options() is no longer needed past this point.
+$main::_DATA_ = undef;
+
+#################################################################################
+#
+# BEGIN
+#
+#################################################################################
+# Known container layouts: outer tag, row tag (may be empty), cell tag regexp.
+my %TAGS = (
+    'table' => [ 'table', 'tr', 't[dh]' ],
+    'dl'    => [ 'dl',    '',   'd[td]' ],
+);
+
+# Validate --tag before fetching anything, so a typo fails immediately.
+exists $TAGS{$Opt{tag}} or die "$NAME: Unknown tag '$Opt{tag}'\n";
+my ($T1,$T2,$T3) = @{ $TAGS{$Opt{tag}} };
+
+# URL arguments become "curl ... |" pipes that the magic <> open below hands
+# to the shell. Single quotes inside the URL are escaped so they cannot close
+# the quoting and inject shell commands.
+# NOTE(review): plain file arguments still go through the 2-arg magic open
+# of <>, so a file name ending in "|" would be run as a command.
+@ARGV = map {
+    my $arg = $_;
+    if ($arg =~ m,^\w+://,) {
+        $arg =~ s/'/'\\''/g;
+        $arg = "curl -s '$arg' |";
+    }
+    $arg;
+} @ARGV;
+my $html = join('',<>);
+
+# Flatten the document: drop line breaks, then squeeze whitespace runs, so the
+# non-greedy patterns below can span what used to be several lines.
+# NOTE(review): line breaks are removed, not replaced by a space, so words
+# separated only by a newline end up glued together - confirm this is wanted.
+$html =~ s/[\r\n]+//g;
+$html =~ s/\s+/ /g;
+
+# Optional attribute filters (--exp1/2/3): when given, the opening tag must
+# also match that regexp somewhere before its closing ">". length() is used
+# rather than truthiness so that a filter of "0" is not silently ignored.
+my ($exp1,$exp2,$exp3) = map { length($_) ? "[^>]*$_" : '' } @Opt{qw(exp1 exp2 exp3)};
+
+# Compile the three patterns once instead of re-interpolating them on every
+# loop iteration. Each captures the content between an opening and a closing tag.
+my $re_table = qr{<${T1}${exp1}[^>]*>(.*?)<\s*/\s*${T1}\s*>}i;
+my $re_row   = qr{<${T2}${exp2}[^>]*>(.*?)<\s*/\s*${T2}\s*>}i;
+my $re_cell  = qr{<${T3}${exp3}[^>]*>(.*?)<\s*/\s*${T3}\s*>}i;
+
+my $table_num = 0;
+for my $table ($html =~ /$re_table/g) {
+    $table_num++;
+
+    # --num N keeps only the N-th matching container (counting from 1);
+    # 0 or unset keeps them all.
+    next if $Opt{num} and $Opt{num} != $table_num;
+
+    # Layouts without a row tag (dl): wrap the content in an empty pseudo-tag
+    # so the row pattern can treat the whole container as one row.
+    $table = "<>$table</>" unless $T2;
+
+    for my $tr ($table =~ /$re_row/g) {
+        my @col;
+
+        for my $td ($tr =~ /$re_cell/g) {
+            $td = html2txt($td) unless $Opt{html};
+            # Bytes C2 A0 (a UTF-8 encoded no-break space) become a plain
+            # space; written inline so the global $_ is left untouched.
+            $td =~ s/\xC2\xA0/ /g;
+            $td =~ s/\s+/ /g;
+            $td = str_trim($td);
+            push(@col,$td);
+        }
+
+        # One output line per row, cells joined by the --sep separator.
+        print join($Opt{sep},@col)."\n";
+    }
+}
+
+#################################################################################
+#
+# END
+#
+#################################################################################
+exit 0;
+
+#################################################################################
+#
+# Functions
+#
+#################################################################################
+sub help {
+#------------------------------------------------------------------------------
+# Print help and exit
+#
+# Renders the POD text held in $main::_DATA_: variable tokens are expanded,
+# the result is written to a temporary file and handed to Pod::Usage, then
+# the process exits with status 0. With --help only the SYNOPSIS, DESCRIPTION
+# and OPTIONS sections are requested (verbose 99); otherwise verbose 3 is used.
+#------------------------------------------------------------------------------
+
+    require 'Pod/Usage.pm' unless $INC{'Pod/Usage.pm'};
+    require 'Pod/Perldoc.pm' unless $INC{'Pod/Perldoc.pm'};
+
+    # Substitutions
+    # Expand one "$name" / "@name" token from the POD into the value of the
+    # variable of that name (arrays are joined with single spaces).
+    sub pod_env {
+        my $token = $_[0];
+        my $v;
+        my $code = '$v = ref(\\'.$token.') eq "ARRAY" ? join(" ",'.$token.') : '.$token.'; 1';
+
+        # Tokens that are not usable variables (for instance the "@gmail"
+        # part of an e-mail address) make the eval fail: keep them verbatim
+        # instead of replacing them with an empty string.
+        eval $code or return $token;
+
+        return defined $v ? $v : 'UNDEF';
+    }
+
+    $main::_DATA_ =~ s/([@\$][A-Z_a-z\{\}]+)/pod_env($1)/eg;
+
+    # Create tmp
+    # NOTE(review): the name is predictable ("<script>.<pid>" in a shared
+    # directory); consider File::Temp if this can run on multi-user hosts.
+    my $in_file = (-e '/dev/shm' ? '/dev/shm' : '/tmp')."/$NAME.$$";
+    open(my $in, '>', $in_file) or die "$NAME: Can't write into $in_file: $!";
+    print $in $main::_DATA_;
+    close $in or die "$NAME: Can't write into $in_file: $!";
+
+    # Output
+    # Everything printed goes through a filter that removes the ".<pid>"
+    # suffix of the temp file name (presumably because that name shows up in
+    # the rendered page). The dot is escaped so only a literal "." matches.
+    my $pid = $$;
+    my $pager = ($ENV{PAGER}||'') eq 'less' ? "|less -FRi" : "";
+    open(STDOUT, '|-', "perl -pe 's/\\.$pid//g'".$pager) or do {
+        my $err = $!;
+        unlink $in_file;
+        die "$NAME: Can't open output filter: $err";
+    };
+
+    my $opts = {
+        -input => $in_file,
+        -output => \*STDOUT,
+        -exitval => 'noexit',
+        -sections => [qw(SYNOPSIS DESCRIPTION OPTIONS)],
+        -verbose => ($Opt{'help'} ? 99 : 3),
+    };
+
+    Pod::Usage::pod2usage($opts);
+    close STDOUT;
+    unlink $in_file if $in_file and -e $in_file;
+
+    exit 0;
+}
+
+#------------------------------------------------------------------------------
+# Print version and exit
+#
+# Writes "<script name>: version [<version>]" to STDOUT, then exits with 0.
+#------------------------------------------------------------------------------
+sub version {
+    printf "%s: version [%s]\n", $NAME, $VERSION;
+    exit 0;
+}
+
+#------------------------------------------------------------------------------
+# Get options from pod
+#
+# Reads the POD after __DATA__, collects every "option[SPEC]" marker as a
+# Getopt::Long specification, rewrites the marker into its printable form and
+# accumulates the text in $main::_DATA_. The command line is then parsed into
+# the hash reference given as first argument.
+#------------------------------------------------------------------------------
+sub get_options {
+
+    use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
+
+    my $opt = $_[0];
+    my @specs;
+
+    # Turn a Getopt::Long spec such as "num|n=i" into "-num, -n" for display.
+    sub pod_opt {
+        local $_;
+        my $spec = shift;
+        # Drop the value type ("=s", "=i") or a trailing +, - or ! modifier.
+        $spec =~ s/(=.|[\+\-\!]$)//;
+        my @names = split(/[\|,:;]/,$spec);
+        return join(", ", map { "-$_" } @names);
+    }
+
+    while (<DATA>) {
+        s/option\[([^\]]+)\]/push(@specs,$1) and pod_opt($1)/eg;
+        $main::_DATA_ .= $_;
+    }
+
+    # Parsing errors have already been reported by Getopt::Long at this point.
+    exit -1 unless GetOptions($opt,@specs);
+
+    if ($opt->{'help'} or $opt->{'man'}) {
+        help();
+    }
+    if ($opt->{'version'}) {
+        version();
+    }
+
+    # Propagate the levels to both the file lexicals and the package variables.
+    if (defined $opt->{'verbose'}) {
+        $main::VERBOSE = $VERBOSE = $opt->{'verbose'};
+    }
+    if (defined $opt->{'debug'}) {
+        $main::DEBUG = $DEBUG = $opt->{'debug'};
+    }
+
+}
+
+__DATA__
+
+=head1 NAME
+
+$NAME - Script to extract html table into csv
+
+=head1 SYNOPSIS
+
+Quick usage:
+
+=over
+
+=item $NAME --verbose
+
+=item $NAME --help
+
+=back
+
+=head1 OPTIONS
+
+ -option[tag|T=s] Default: table
+ -option[sep|s=s] Default: tab (\t)
+ -option[html!] Keep raw html in cells (default: cells go through html2txt)
+ -option[num|n=i] Only dump the table with this number, counting from 1 (default: all tables)
+ -option[exp1|E1=s] Regexp filter on tag 1
+ -option[exp2|E2=s] Regexp filter on tag 2
+ -option[exp3|E3=s] Regexp filter on tag 3
+ -option[verbose|v+] Verbose mode: increase the verbosity level.
+ -option[debug+] Debug mode: increase the debug level.
+ -option[version|V] Print version (default: $VERSION)
+ -option[help|h|?] Print a brief help message and exit.
+ -option[man] Print the manual page and exit.
+
+=cut
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright (C) 2017 Nicolas Boisselier
+
+This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
+
+See <http://www.gnu.org/licenses/>.
+
+=head1 AUTHOR
+
+Nicolas Boisselier <nicolas.boisselier@gmail.com>
+
+=cut