#!/usr/bin/env perl
#################################################################################
#
# html2csv - dump HTML tables (or definition lists) as delimited text
#
# Provenance: this is the script added as bin/html2csv by commit
# b603f85bb6a373cb171ae4b581ad5642a1ef29f2 of nb.git ("bin/html2csv",
# Nicolas Boisselier, Mon, 21 May 2018 14:53:41 +0000).  That commit deleted
# bin/html-table2csv and re-added it under the new name with these changes:
#   - default option key 'nume' corrected to 'num'
#   - new options --exp1 / --exp2 / --exp3 (regexp filters on the three tags)
#   - unknown --tag values are rejected
#
#################################################################################
use strict;
use warnings;
use Fcntl qw(O_WRONLY O_CREAT O_EXCL);
use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
#use LWP::Simple qw/get/;
use NB::Functions qw/html2txt str_trim/;

#################################################################################
#
# VERSION
#
#################################################################################
my $VERSION = '0.0.1';
# NB 25.01.17
# - create script: html-table2csv

#################################################################################
#
# GLOBALS
#
#################################################################################
my ($NAME) = $0 =~ m,([^/]+)$,;

#################################################################################
#
# ARGS
#
#################################################################################
my $VERBOSE = $main::VERBOSE = 1;
my $DEBUG   = $main::DEBUG   = 0;

my %Opt = (
    'num'  => undef,
    'tag'  => 'table',
    'exp1' => '',
    'exp2' => '',
    'exp3' => '',
    'sep'  => "\t",
    'html' => 0,
);
get_options(\%Opt);
help() unless @ARGV;

# The POD text is only needed by help(); release it once options are parsed.
$main::_DATA_ = undef;

#################################################################################
#
# BEGIN
#
#################################################################################
my $html = slurp_sources(@ARGV);

# Collapse every whitespace run (newlines included) to a single space.
# Newlines must become a space rather than vanish, otherwise words and
# attributes split across lines get glued together ("<td\nclass" -> "<tdclass").
$html =~ s/\s+/ /g;

# For each supported --tag value: [ container tag, row tag, cell tag ].
# An empty row tag means the container has no row level.
my %TAGS = (
    'table' => [ 'table', 'tr', 't[dh]' ],
    'dl'    => [ 'dl',    '',   'd[td]' ],
);
exists $TAGS{ $Opt{tag} } or die "$NAME: Unknown tag '$Opt{tag}'\n";
my ($T1, $T2, $T3) = @{ $TAGS{ $Opt{tag} } };

# Compile the three element patterns once, outside the loops.
my $table_re = element_re($T1, $Opt{exp1});
my $row_re   = length($T2) ? element_re($T2, $Opt{exp2}) : undef;
my $cell_re  = element_re($T3, $Opt{exp3});

my $table_num = 0;

TABLE:
for my $table ($html =~ /$table_re/g) {
    $table_num++;
    next TABLE if $Opt{num} and $Opt{num} != $table_num;

    # Without a row tag (dl) the whole container body is treated as one row.
    # The previous "<>$table" wrapper could never match its own closing
    # pattern, so --tag dl printed nothing.  --exp2 has no effect in that mode.
    my @rows = defined $row_re ? ($table =~ /$row_re/g) : ($table);

    for my $row (@rows) {
        my @col;

        for my $cell ($row =~ /$cell_re/g) {
            $cell = html2txt($cell) unless $Opt{html};
            $cell =~ s/\xC2\xA0/ /g;    # chr(194).chr(160): UTF-8 bytes of a no-break space
            $cell =~ s/\s+/ /g;
            $cell = str_trim($cell);
            push @col, $cell;
        }

        print join($Opt{sep}, @col), "\n";
    }

    # The requested table has been printed; nothing further can match.
    last TABLE if $Opt{num};
}

#################################################################################
#
# END
#
#################################################################################
exit 0;

#################################################################################
#
# Functions
#
#################################################################################

#------------------------------------------------------------------------------
# Read every source and return the concatenated content.
#   - '-'            : standard input
#   - scheme://...   : fetched through "curl -s" (list-form pipe, no shell)
#   - anything else  : opened as a plain file for reading
# A source that cannot be opened is reported on STDERR and skipped.
#------------------------------------------------------------------------------
sub slurp_sources {
    my @sources = @_;
    my $content = '';

    SOURCE:
    for my $source (@sources) {
        my $fh;

        if ($source eq '-') {
            $fh = \*STDIN;
        }
        elsif ($source =~ m{^\w+://}) {
            # List form keeps the URL away from the shell, so quotes or
            # metacharacters in it cannot inject commands.
            unless (open($fh, '-|', 'curl', '-s', $source)) {
                warn "$NAME: Can't run curl for $source: $!\n";
                next SOURCE;
            }
        }
        else {
            # Three-argument open: the name is never interpreted as a mode or a pipe.
            unless (open($fh, '<', $source)) {
                warn "$NAME: Can't open $source: $!\n";
                next SOURCE;
            }
        }

        local $/;    # slurp mode for this iteration only
        my $chunk = <$fh>;
        $content .= $chunk if defined $chunk;
        close $fh;
    }

    return $content;
}

#------------------------------------------------------------------------------
# Build the case-insensitive pattern matching one <tag ...>body</tag> element
# and capturing its body.
#   $tag    : tag name, or a regexp fragment such as 't[dh]'
#   $filter : optional user regexp that must match inside the opening tag
# Dies with a readable message when $filter is not a valid regexp.
# Nested elements of the same tag are not handled (non-greedy match stops at
# the first closing tag).
#------------------------------------------------------------------------------
sub element_re {
    my ($tag, $filter) = @_;

    my $attr = (defined $filter and length $filter) ? "[^>]*$filter" : '';

    # \b keeps e.g. 'tr' from matching '<track>'.
    my $re = eval { qr{<${tag}\b${attr}[^>]*>(.*?)<\s*/\s*${tag}\s*>}i };
    die "$NAME: Invalid regexp filter '$filter': $@" unless defined $re;

    return $re;
}

#------------------------------------------------------------------------------
# Return the text substituted for a $VAR / @VAR token found in the POD.
# Only the variables listed here are known; any other token is returned
# unchanged.  Braces (${NAME}) are accepted.
#------------------------------------------------------------------------------
sub pod_env {
    my ($token) = @_;

    my %value_of = (
        '$NAME'    => $NAME,
        '$VERSION' => $VERSION,
        '$VERBOSE' => $VERBOSE,
        '$DEBUG'   => $DEBUG,
    );

    (my $key = $token) =~ tr/{}//d;

    return exists $value_of{$key} ? $value_of{$key} : $token;
}

#------------------------------------------------------------------------------
# Print help and exit
#
# The POD collected by get_options() is written to a temporary file named
# "$NAME.<pid>" and handed to pod2usage.  STDOUT is piped through a filter
# that strips the ".<pid>" suffix from the output, and through less when
# PAGER is exactly "less".
#------------------------------------------------------------------------------
sub help {

    require Pod::Usage;
    require Pod::Perldoc;

    # Substitutions
    $main::_DATA_ =~ s/([@\$][A-Z_a-z\{\}]+)/pod_env($1)/eg;

    # Create tmp
    my $pid     = $$;
    my $in_file = (-e '/dev/shm' ? '/dev/shm' : '/tmp') . "/$NAME.$pid";

    # The path is predictable and lives in a world-writable directory:
    # remove any leftover and create it exclusively so a pre-planted
    # symlink is never followed.
    unlink $in_file;
    sysopen(my $in, $in_file, O_WRONLY | O_CREAT | O_EXCL, 0644)
        or die "$NAME: Can't write into $in_file: $!";
    print {$in} $main::_DATA_;
    close $in or die "$NAME: Can't write into $in_file: $!";

    # Output
    # "\\." keeps the backslash in the generated command so that only a
    # literal ".<pid>" is removed.
    my $filter = "perl -pe 's/\\.$pid//g'";
    $filter .= '|less -FRi' if ($ENV{PAGER} || '') eq 'less';
    open(STDOUT, '|-', $filter) or die "$NAME: Can't run '$filter': $!";

    my $opts = {
        -input    => $in_file,
        -output   => \*STDOUT,
        -exitval  => 'noexit',
        -sections => [qw(SYNOPSIS DESCRIPTION OPTIONS)],
        -verbose  => ($Opt{'help'} ? 99 : 3),
    };

    Pod::Usage::pod2usage($opts);
    close STDOUT;
    unlink $in_file if -e $in_file;

    exit 0;
}

#------------------------------------------------------------------------------
# Print version and exit
#------------------------------------------------------------------------------
sub version { print "$NAME: version [$VERSION]\n"; exit 0; }

#------------------------------------------------------------------------------
# Turn a Getopt::Long spec into its help spelling: "tag|T=s" -> "-tag, -T".
# The POD line already carries one leading dash, so the first name ends up
# displayed as "--tag".
#------------------------------------------------------------------------------
sub pod_opt {
    my ($spec) = @_;

    $spec =~ s/(=.|[\+\-\!]$)//;

    return join(", ", map { "-$_" } split(/[\|,:;]/, $spec));
}

#------------------------------------------------------------------------------
# Get options from pod
#
# Every "option[SPEC]" marker in the __DATA__ section is both registered as a
# Getopt::Long spec and rewritten to its help spelling.  The rewritten POD is
# accumulated in $main::_DATA_ for help().
#   $opt : hash reference holding the defaults; updated in place.
# Exits with status -1 (255 on POSIX) when the command line is invalid.
#------------------------------------------------------------------------------
sub get_options {
    my ($opt) = @_;

    my @specs;

    while (my $line = <DATA>) {
        $line =~ s{option\[([^\]]+)\]}{ push(@specs, $1); pod_opt($1) }eg;
        $main::_DATA_ .= $line;
    }

    GetOptions($opt, @specs) || exit -1;

    help()    if $opt->{'help'} or $opt->{'man'};
    version() if $opt->{'version'};

    $main::VERBOSE = $VERBOSE = $opt->{'verbose'} if defined $opt->{'verbose'};
    $main::DEBUG   = $DEBUG   = $opt->{'debug'}   if defined $opt->{'debug'};

    return;
}

__DATA__

=head1 NAME

$NAME - Script to extract html table into csv

=head1 SYNOPSIS

Quick usage:

=over

=item $NAME [options] FILE|URL ...

=item $NAME --verbose

=item $NAME --help

=back

=head1 DESCRIPTION

Reads HTML from the files or URLs given as arguments (URLs are fetched with
curl, "-" means standard input) and prints every row of each matching table
as one line, cells joined by the separator.

=head1 OPTIONS

 -option[tag|T=s]     Default: table
 -option[sep|s=s]     Default: tab (\t)
 -option[html!]       Escape html (default: yes)
 -option[num|n=i]     Only dump table number
 -option[exp1|E1=s]   Regexp filter on tag 1
 -option[exp2|E2=s]   Regexp filter on tag 2
 -option[exp3|E3=s]   Regexp filter on tag 3
 -option[verbose|v+]  Verbose mode: increase the verbosity level.
 -option[debug+]      Debug mode: increase the verbosity level.
 -option[version|V]   Print version (default: $VERSION)
 -option[help|h|?]    Print a brief help message and exits.
 -option[man]         Print the manual page and exits.

=cut

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2017 Nicolas Boisselier

This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

See <http://www.gnu.org/licenses/>.

=head1 AUTHOR

Nicolas Boisselier

=cut