#! /usr/bin/perl
#Основан на [[:en:User:Bo Lindbergh/dabalyze]]

use strict;

my %interesting=
    ('' => {
        name            => 'article',
        filename        => 'articles.txt',
        cutoff          => 10},
     'Template' => {
         name           => 'template',
         filename       => 'templates.txt',
         cutoff         => 0,
         list           => 1});

my $exp_re=qr/\(значения\)$/;

my @templates=split(/\n/,<<__EOT__);
Disambig
Disambiguation
Неоднозначность
__EOT__

foreach my $template (@templates) {
    $template =~ s/^([[:alpha:]])/[$1\L$1]/;
}

my $tmpl_re=join('|',sort({$b cmp $a} @templates));

my $dab_re=qr/{{(?i:msg:)?\s*(?i:(?:template|шаблон)\s*:\s*)?($tmpl_re)\s*}}/;

my($ns_re,%ns_canon);

my $want_progress=@ARGV>0 && $ARGV[0] eq '-p';
my $last_progress=-1;

sub pageloop (&)
{
    my($handler)=@_;
    my($size);
    local $/="</page>\x0A";

    $size=-s PAGES;
    while (defined(my $page=<PAGES>)) {
        my($nstitle,$ns,$title);

        $page =~ /^\s*<page>/ or last;
        ($nstitle)=($page =~ m{<title>([^<]+)</title>})
            or die "Can't find page title";
        if ($nstitle =~ /^($ns_re):(.+)$/) {
            $ns=$1;
            $title=$2;
        } else {
            $ns='';
            $title=$nstitle;
        }
        $page =~ m{</text>} or next;
        substr($page,$-[0])='';
        $page =~ /<text xml:space="preserve">/
            or die "Can't find start of text for page $nstitle";
        substr($page,0,$+[0])='';
        $handler->($nstitle,$ns,$title,$page);
        if ($want_progress) {
            my $progress=int(tell(PAGES)/$size*1000);
            if ($progress!=$last_progress) {
                $last_progress=$progress;
                printf STDERR "\r0.%.3u",$progress;
            }
        }
    }
    if ($want_progress) {
        print STDERR "\r";
    }
}

sub mungtarget ($$$ )
{
    my(undef,$source,$sub)=@_;

    for my $target ($_[0]) {
        $target =~ tr/\t\n\r/   /;
        $target =~ s/^ +//;
        $target =~ s/ +$//;
        $target =~ s/ {2,}/ /g;
        if ($sub && $target =~ m{^/}) {
            $target=$source.$target;
        } elsif ($target =~ /^:*($ns_re) *: *(.+)$/i) {
            $target=$2;
            utf8::decode($target);
            $target=ucfirst($target);
            utf8::encode($target);
            $target=$ns_canon{lc($1)}.":".$target;
        } elsif ($target =~ /^:*(.+)$/i) {
            $target=$1;
            utf8::decode($target);
            $target=ucfirst($target);
            utf8::encode($target);
        } else {
            # a malformed link, usually empty brackets
        }
    }
}

my(%dab,%redir,@circular);

sub pass1 ()
{
    print STDERR "Analysis pass 1\n";
    {
        my($siteinfo,@namespaces);
        local $/="</siteinfo>\x0A";

        $siteinfo=<PAGES>;
        @namespaces=
            $siteinfo =~ m{<namespace key="-?\d+">([^<]+)</namespace>}g;
        $ns_re=join('|',map(quotemeta($_),sort({$b cmp $a} @namespaces)));
        foreach my $ns (@namespaces) {
            $ns_canon{lc($ns)}=$ns;
        }
    }
    pageloop {
        my($nstitle,$ns,$title)=splice(@_,0,3);

        for my $text ($_[0]) {
            my $sub=$interesting{$ns}->{subpages};

            if ($ns eq '' && $text =~ $dab_re) {
                $dab{$nstitle}=1;
            }
            if ($text =~ /^#redirect.*\[\[([^\]\|]+)/i) {
                my($target,$back);

                $target=$1;
                mungtarget($target,$nstitle,$sub);
                while ($target ne $nstitle) {
                    my($newtarget);

                    $newtarget=$redir{$target};
                    last unless defined($newtarget);
                    $target=$newtarget;
                }
                if ($target eq $nstitle) {
                    push(@circular,$nstitle);
                } else {
                    $redir{$nstitle}=$target;
                }
            }
        }
    };
    foreach my $target (keys(%redir)) {
        my(@chain);

        for (;;) {
            my $newtarget=$redir{$target};
            last unless defined($newtarget);
            push(@chain,$target);
            $target=$newtarget;
        }
        pop(@chain);
        foreach my $source (@chain) {
            $redir{$source}=$target;
        }
    }

    print STDERR "    ".keys(%dab)." total disambiguation pages\n";
    print STDERR "\n";
}

my %stats=map {
    ($_,{});
} keys(%interesting);

my %lists=map {
    ($_,{});
} grep {
    $interesting{$_}->{list};
} keys(%interesting);

sub pass2 ()
{
    my(%linked);

    print STDERR "Analysis pass 2\n";
    {
        local $/="</siteinfo>\x0A";

        <PAGES>;
    }
    pageloop {
        my($nstitle,$ns,$title)=splice(@_,0,3);

        for my $text ($_[0]) {
            my($stats,$lists,$sub);

            $stats=$stats{$ns};
            $lists=$lists{$ns};
            $sub=$interesting{$ns}->{subpages};
            if ($stats) {
                my(%seen);

                while ($text =~ /\[\[([^\]\|]+)/g) {
                    my($target,$final);

                    $target=$1;
                    mungtarget($target,$nstitle,$sub);
                    next if $target =~ $exp_re;
                    $final=$redir{$target};
                    $final=$target unless defined($final);
                    if ($dab{$final} && !$seen{$final}++) {
                        $linked{$final}=1;
                        $stats->{$final}++;
                        if ($lists) {
                            push(@{$lists->{$final}},$nstitle);
                        }
                    }
                }
            }
        }
    };
    print STDERR "    ".keys(%linked)." linked disambiguation pages\n";
    foreach my $ns (sort(keys(%stats))) {
        print STDERR ("    ".keys(%{$stats{$ns}})." in the ".
                      $interesting{$ns}->{name}." namespace\n");
    }
    print STDERR "\n";
}

sub wikilink ($ )
{
    my($target)=@_;

    if (exists($redir{$target})) {
        "[{{SERVER}}{{localurl:$target|redirect=no}} $target]";
    } elsif ($target =~ m{/\.{1,2}(?:$|/)}) {
        "[{{SERVER}}{{localurl:$target}} $target]";
    } elsif ($target =~ m{^/}) {
        "[[:$target]]";
    } else {
        "[[$target]]";
    }
}

sub report ()
{
    print STDERR "Report generation\n";

    foreach my $target (@circular) {
        $redir{$target}=$target;
    }

    while (my($ns,$stats)=each(%stats)) {
        my($filename,$cutoff)=@{$interesting{$ns}}{qw(filename cutoff)};
        my $lists=$lists{$ns};
        my @nstitles=sort {
            $stats->{$b}<=>$stats->{$a} || $a cmp $b;
        } grep {
            $stats->{$_}>=$cutoff;
        } keys(%{$stats});
        my $total=0;

        open(REPORT,'>',$filename)
            or die "Can't create $filename: $!";
        binmode(REPORT);
        print REPORT "\xEF\xBB\xBF";
        foreach my $nstitle (@nstitles) {
            $total+=$stats->{$nstitle};
        }
        print REPORT "Всего найдено $total ссылок.\n";
        foreach my $nstitle (@nstitles) {
            print REPORT ("# ",wikilink($nstitle),": ",$stats->{$nstitle},
                          " [[Special:Whatlinkshere/",$nstitle,"|ссылок]]\n");
            if ($lists) {
                foreach my $source (sort(@{$lists->{$nstitle}})) {
                    print REPORT "#* ",wikilink($source),"\n";
                }
            }
        }
        close(REPORT);
        print STDERR "    ".@nstitles." entries written to $filename\n";
    }

    if (@circular) {
        @circular=sort(@circular);
        open(REPORT,'>','circular.txt')
            or die "Can't create circular.txt: $!";
        binmode(REPORT);
        print REPORT "\xEF\xBB\xBF";
        foreach my $target (@circular) {
            print REPORT "* ",wikilink($target),"\n";
        }
        close(REPORT);
        print STDERR "    ".@circular." entries written to circular.txt\n";
    } else {
        unlink('circular.txt');
    }
}

open(PAGES,'<','pages_current.xml')
    or die "Can't open pages_current.xml: $!";
binmode(PAGES);
pass1();
seek(PAGES,0,0);
pass2();
close(PAGES);
report();