#!/usr/bin/perl # Usage: perl crd-reference-v2.pl < crd-reference-v2.xml use strict; use warnings; use open ":utf8"; use open ":std"; use HTML::Entities; my @cols = qw(sys-id reg-date lst-date lib-id lib-name reg-id crt-date solution keyword class res-type con-type bibl-desc bibl-isbn bibl-note referral ptn-type contri file-num); my @l_cols = qw(question answer ans-proc pre-res note); open(O1, "> crd-reference-v2.txt"); open(O2, "> crd-reference-v2-urls.txt"); print O1 join("\t", @cols, map { sprintf "%s (length)", $_ } @l_cols) . "\n"; print O2 join("\t", 'sys-id', 'name', 'url', 'fqdn', 'tld', 'sld') . "\n"; my $result = ''; while () { $result .= $_; if ($_ =~ m!!) { my %el; while ($result =~ m!<([^>]+)>([^<]+)[0] =~ /^(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})$/) { $el{'reg-date'}->[0] = sprintf "%04d-%02d-%02d %02d:%02d:%02d", $1, $2, $3, $4, $5, $6; } if ($el{'lst-date'}->[0] =~ /^(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})$/) { $el{'lst-date'}->[0] = sprintf "%04d-%02d-%02d %02d:%02d:%02d", $1, $2, $3, $4, $5, $6; } if ($el{'crt-date'}->[0] && $el{'crt-date'}->[0] =~ /^(\d{4})(\d{2})(\d{2})$/) { $el{'crt-date'}->[0] = sprintf "%04d-%02d-%02d", $1, $2, $3; } print O1 join("\t", (map { ($el{$_}) ? join(";", @{$el{$_}}) : '' } @cols), (map { ($el{$_}) ? length($el{$_}->[0]) : '' } @l_cols)) . "\n"; while (my ($key, $value) = each %el) { if ($key eq 'url') { next; } foreach my $content (@{$value}) { my @urls = get_urls($content); foreach my $url (@urls) { my $fqdn; if ($url =~ m!^https?://([^/]+)!i) { $fqdn = $1; } my @host = split(/\./, $fqdn); print O2 join("\t", $el{'sys-id'}->[0], $key, $url, $fqdn, $host[-1], "$host[-2].$host[-1]") . "\n"; } } } undef $result; $result = ''; } } close(O1); close(O2); sub normalization { my $str = shift; $str =~ s/^\s+//; $str =~ s/\s+$//; $str =~ s/\t/ /g; $str =~ s/\n/ /g; $str = decode_entities($str); return $str; } sub get_urls { my $content = shift; my $re_url = 'https?://[\w\-.!~*\';/?:@&=+$,%]+'; # ()# my %URL; { use bytes; while ($content =~ m!]*>!ig) { $URL{$1}++; } while ($content =~ m!($re_url)!go) { $URL{$1}++; } } return sort keys %URL; }