#!/usr/bin/perl
### fdedup.perl  -*- Perl -*-
## Issue FIDEDUPERANGE ioctl(2) against files specified.

### Ivan Shmakov, 2017

## To the extent possible under law, the author(s) have dedicated
## all copyright and related and neighboring rights to this software
## to the public domain worldwide.  This software is distributed
## without any warranty.

## You should have received a copy of the CC0 Public Domain Dedication
## along with this software.  If not, see
## <http://creativecommons.org/publicdomain/zero/1.0/>.

### Code:

use common::sense;
use English qw (-no_match_vars);

# require Data::Dump;
require Getopt::Long;
require IO::File;
# require IO::Handle;

our $dev_fd_prefix
    = ($ENV{"DEVFDPREFIX"} // "/dev/fd/");

## FIXME: use an appropriate .ph module
my $ion
    = (((2 | 1) << (0 + 8 + 8 + 14))
       | (0x94 << (0 + 8))
       | (54 << 0)
       | ((8 + 8 + 2 + 2 + 4) << (0 + 8 + 8)));
# warn ("D: ", sprintf ("%x", $ion), "\n") if (0);

sub min {
    my $v = shift;
    foreach my $x (@_) {
        $v = $x if ($v > $x);
    }
    ## .
    $v;
}

sub open_file {
    my ($fn, $mode) = @_;
    ## .
    return ("GLOB" eq ref ($fn) ? IO::File->new_from_fd ($fn->fileno ())
            : ($fn eq "-") ? (index ($mode, ">") > 0 ? \*STDOUT : \*STDIN)
            : do {
                $mode
                    //= "<";
                my $l
                    = length ($dev_fd_prefix);
                ## .
                ((substr ($fn, 0, $l) eq $dev_fd_prefix)
                 ? IO::File->new_from_fd (substr ($fn, $l), $mode)
                 : IO::File->new ($fn, $mode));
            });
}

sub mk_getline {
    my ($delim, @files) = @_;
    my ($fn, $fh);
    ## .
    sub {
        my $s;
        local $/
            = $delim;
        while (! defined ($fh) || ! defined ($s = $fh->getline ())) {
            if (defined ($fh)) {
                die ($fn, ": Error reading file: ", $!)
                    unless ($fh->eof ());
                $fh->close ();
            }
            last
                unless (@files > 0);
            $fn = shift (@files);
            $fh = open_file ($fn)
                or die ($fn, ": Cannot open file: ", $!);
            our ($debug_p);
            warn ("D: ", $fn, ": Opened\n") if ($debug_p);
        }
        ## .
        $s;
    };
}

sub dedup_1 {
    my ($orig_fn, @target_fns) = @_;
    our ($debug_p);
    warn ("D: Deduplicate ", scalar (Data::Dump::dump (\@_)), "\n") if ($debug_p);
    my $orig
        = open_file ($orig_fn)
        or die ($_, ": Cannot open source for reading: ", $!);
    my @targets = map {
        my $fh
            = open_file ($_, "+<")
            or warn ($_, ": Cannot open target for writing (skipped): ", $!);
        ## .
        (defined ($fh) ? ($fh) : ());
    } (@target_fns);
    unless (@target_fns > 0) {
        warn ("W: ", $orig_fn, ": Cannot deduplicate a single file; skipped\n");
        ## .
        return;
    }
    our ($preserve_times_p);
    my @utimes
        =  ($preserve_times_p ? map {
                my @s
                    = stat ($_);
                if (@s) {
                    ## .
                    [ @s[8, 9], $_ ]
                } else {
                    warn ("W: Failed to stat a filehandle: ",
                          $!, "; skipped\n");
                    ## .
                    undef;
                }
            } (@targets)
            : ());
    my @fds
        = map { $_->fileno (); } ($orig, @targets);
    # warn ("D: ", join (", ", @fds), "\n") if ($debug_p);
    my $len
        = $orig->sysseek (0, 2)
        or die ($orig_fn, ": Cannot determine file length: ", $!);
    our ($chunk_size, $min_size);
    my $max
        = $chunk_size // 0x1000000;
    my $min
        = $min_size // 0x10000;
    ## FIXME: assumed to be a power of two, or undef
    my $round_down
        = undef; ## 4096;
    ## FIXME: or infinity?
    my $actual_step
        = $max;
    for (my $off = 0; $off < $len; $off += min ($max, $actual_step)) {
        die ("Deduplication length too small",
             " (got ", $max, ", minimum ", $min, ")\n")
            unless ($max >= $min);
        my $delen
            = $actual_step
            = min ($max,
                   (defined ($round_down)
                    ? (($len - $off) & ~ (-1 + $round_down))
                    :   $len - $off));
        my $s
            = join ("", pack   ("QQSSL", $off, $delen,
                                -1 + @fds, 0, 0), map {
                                    pack ("qQQlL", $_, $off, 0, 0, 0);
                                } (@fds[1 .. $#fds]));
        # warn ("D: ioctl (", join (", ", $fds[0], $ion, unpack ("H48(H64)*", $s)), ")\n") if ($debug_p);
        warn ("D: ioctl (", $orig->fileno (), ", FIDEDUPERANGE, { ", join (", ", unpack ("QQSSL(qQQlL)*", $s)), " }") if ($debug_p);
        our ($dry_run_p);
        next
            if ($dry_run_p);
        my $r
            = ioctl ($orig, $ion, $s);
        my @ra 
            = unpack ("QQSSL(qQQlL)*", $s);
        # warn ("D: ioctl (", join (", ", $fds[0], $ion, unpack ("H48(H64)*", $s)), ") = ", $r, "\n") if ($debug_p);
        warn ("D: ioctl () => ", $r, ", { ", join (", ", @ra), " }") if ($debug_p);

        our ($once_p);
        if ($once_p) {
            warn ("D: --once is specified; going to the next set") if ($debug_p);
            last;
        }

        my $new
            = undef;
        for (my ($i, $stop) = (5, 5 + 5 * $#fds); $i < $stop; $i += 5) {
            warn ("D: considering ", scalar (Data::Dump::dump (@ra[$i .. (4 + $i)])), "\n") if ($debug_p);
            ## Skip if error or otherwize 0 bytes deduplicated
            next
                if ($ra[3 + $i] != 0 || $ra[2 + $i] <= 0);
            warn ("D: considering ", $ra[2 + $i], "\n") if ($debug_p);
            $new
                = $ra[2 + $i]
                unless (! defined ($new) || $ra[2 + $i] < $new);
        }
        $new
            = $min
            if (! defined ($new) || $new < $min);
        warn ("D: considering ", $new, "\n") if ($debug_p);
        $max
            = ($new < $max ? $new : $chunk_size // 0x1000000);
        warn ("D: adjusted deduplication length to ", $max, "\n") if ($debug_p);
    }

    foreach my $ut (@utimes) {
        next
            unless (defined ($ut));
        utime (@$ut)
            or warn ("W: Failed to set times on a filehandle: ",
                     $!, "; skipped\n");
    }

    ## .
    undef;
}

sub dedup_writable {
    my ($fn1, @fns) = @_;
    my (@fnw, @fnx);
    our ($ro_re);
    foreach my $fn (@fns) {
        ## NB: we are not going to list files excluded per --read-only-re=
        push (@{-w $fn ? \@fnw : \@fnx}, $fn)
            unless (defined ($ro_re) && $fn =~ $ro_re);
    }
    if (@fnx > 0) {
        require Data::Dump;
        warn ("W: These files are not writable (skipped): ",
              scalar (Data::Dump::dump (\@fnx)), "\n");
    }

    ## .
    if (@fnw > 0) {
        dedup_1 ($fn1, @fnw);
    } else {
        warn ("W: ", $fn1, ": Cannot deduplicate a single file; skipped\n");
    }
}

### main

Getopt::Long::Configure (qw (gnu_compat));

our ($debug_p, $dry_run_p, $preserve_times_p)
    = (0, 0, 0);
our ($chunk_size, $min_size);
our ($once_p)
    = (0);
my ($zero_p)
    = (0);
my ($key_re_s, @ignore_re_s, @ro_re_s);
my (@files_from);

my %cmdline_options =  (q (chunk-size=i)    => \$chunk_size,
                        q (debug!)          => \$debug_p,
                        q (files-from=s)    => \@files_from,
                        q (ignore-regexp=s) => \@ignore_re_s,
                        q (key-regexp=s)    => \$key_re_s,
                        q (n|dry-run!)  => \$dry_run_p,
                        q (minimum-size=i)  => \$min_size,
                        q (once!)           => \$once_p,
                        q (t|preserve-times!)   => \$preserve_times_p,
                        q (read-only-regexp=s)  => \@ro_re_s,
                        q (z|zero-terminated!)  => \$zero_p);
my $parsable_p
    = Getopt::Long::GetOptions (%cmdline_options)
    or die ("Cannot parse command line arguments");
warn ("W: No subsecond resolution timestamp support for",
      " --preserve-times\n")
    if ($preserve_times_p);
die ("Non-option arguments not supported alongside --files-from=\n")
    if (@ARGV > 0 && @files_from > 0);
die ("Non-option arguments not supported alongside --key-regexp=\n")
    if (@ARGV > 0 && defined ($key_re_s));
die ("No files given; use either non-option arguments or --files-from=\n")
    unless (@ARGV > 0 || @files_from > 0);
my $key_re
    = (defined ($key_re_s) ? qr {$key_re_s}sop : undef);
my @ignore_re_l
    = map { qr {$_}so; } (@ignore_re_s);
my @ro_re_l
    = map { qr {$_}so; } (@ro_re_s);
## FIXME: is joining compiled REs future-proof?
my $ignore_re
    = (@ignore_re_l < 1 ? undef
       : do { my $re = join ("|", @ignore_re_l); qr {$re}so; });
our $ro_re
    = (@ro_re_l < 1 ? undef
       : do { my $re = join ("|", @ro_re_l); qr {$re}so; });

require Data::Dump
    if ($debug_p);

if (@ARGV > 0) {
    my @files
        = (!     defined ($ignore_re) ? (@ARGV)
           : grep { $_ !~ $ignore_re; } (@ARGV));
    @files
        = ($files[0], grep { $_ !~ $ro_re; } (@files[1 .. $#files]))
        if (defined ($ro_re));
    if (@files < 1) {
        die ("No files to deduplicate\n");
    } elsif (@files < 2) {
        die ($files[0], ": Cannot deduplicate a single file\n");
    }
    dedup_1 (@files);
    ## .
    exit (0);
}

my $delim
    = ($zero_p ? "\000" : "\n");
my $next_f
    = mk_getline ($delim, @files_from);

if (defined ($key_re)) {
    my %sets;
    while (defined (my $ent = $next_f->())) {
        do { local $/ = $delim; chomp ($ent); };
        unless ($ent =~ $key_re) {
            warn ("W: Entry does not match --key-re= given; skipped\n");
            next;
        }
        my ($k, $f)
            = ($1 // ${^MATCH}, ${^PREMATCH} . ${^POSTMATCH});
        if (exists ($sets{$k})) {
            push (@{$sets{$k}}, $f);
        } else {
            $sets{$k}
                = [ $f ];
        }
    }

    ## FIXME: should compare the contents here and splits the sets if needed
    foreach my $k (keys (%sets)) {
        my @acc
            = @{$sets{$k}};
        dedup_writable (@acc)
            if (@acc > 1);
    }
} else {
    my @acc;
  FILENAME:
    while (defined (my $f = $next_f->())) {
        do { local $/ = $delim; chomp ($f); };
        ## Empty filename signals end of a set
        if ("" eq $f) {
            dedup_writable (@acc)
                if (@acc > 0);
            @acc
                = ();
            next;
        }
        next FILENAME
            if ($f =~ $ignore_re);
        push (@acc, $f);
    }
    dedup_writable (@acc)
        if (@acc > 0);
}

### Emacs trailer
## Local variables:
## coding: us-ascii
## fill-column: 72
## indent-tabs-mode: nil
## ispell-local-dictionary: "american"
## End:
### fdedup.perl ends here
