dotfiles-yadm/.scripts/msrsync

#!/usr/bin/env python2
# -*- coding: utf-8 -*-

# Copyright 2017 Jean-Baptiste Denis <jbd@jbdenis.net>

# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 3, as published
# by the Free Software Foundation.
#
# This file includes a copy of the BSD licensed options.py file from the bup project
# See https://github.com/bup/bup/blob/master/lib/bup/options.py

VERSION = '20170730'

"""
Simple program from a very specific need. I needed to transfer multiple
terabytes of data using rsync. I can use multiple rsync in parallel to
improve the throughput but i didn't want to think about what data each rsync
should transfer, that's why i wrote this program.

I'm targeting a single file self sufficient python 2.6 program. Why 2.6 ? Because RHEL6 and
derivatives. So please be indulgent regarding the python code. But feel free to make suggestions
to improve it in ways that keep it compatible with python 2.6.

This script builds file lists to feed rsync with. It builds file lists whose total
disk size is below a provided limit and whose total number is below a provided limit.

Inspired by the fpsync tool from the fpart project. You should have a look since fpsync
is much more powerful right now and has been used to move around terabytes
of data already. See https://github.com/martymac/fpart
"""

# TODO
# - handle remote-shell src or dest dir, like a normal rsync
# - verbose, debug, multiprocessing compliant output

# rsync options used when the command line carries no --rsync/-r marker
# (see parse_cmdline); also interpolated into the option spec below.
DEFAULT_RSYNC_OPTIONS = "-aS --numeric-ids"

# Option spec string handed to Options() in parse_cmdline. Lines before the
# "--" line are the synopsis; lines after it declare one option each, and a
# line starting with a space is a group heading. The single "%s" placeholder
# in the --rsync description is filled with DEFAULT_RSYNC_OPTIONS.
MSRSYNC_OPTSPEC = """
msrsync [options] [--rsync "rsync-options-string"] SRCDIR [SRCDIR2...] DESTDIR
msrsync --selftest
--
 msrsync options:
p,processes=   number of rsync processes to use [1]
f,files=       limit buckets to <files> files number [1000]
s,size=        limit partitions to BYTES size (1024 suffixes: K, M, G, T, P, E, Z, Y) [1G]
b,buckets=     where to put the buckets files (default: auto temporary directory)
k,keep         do not remove buckets directory at the end
j,show         show bucket directory
P,progress     show progress
stats          show additional stats
d,dry-run      do not run rsync processes
v,version      print version
 rsync options:
r,rsync=       MUST be last option. rsync options as a quoted string ["%s"]. The "--from0 --files-from=... --quiet --verbose --stats --log-file=..." options will ALWAYS be added, no matter what. Be aware that this will affect all rsync *from/filter files if you want to use them. See rsync(1) manpage for details.
 self-test options:
t,selftest     run the integrated unit and functional tests
e,bench        run benchmarks
g,benchshm     run benchmarks in /dev/shm or the directory in $SHM environment variable
""" % DEFAULT_RSYNC_OPTIONS


# rsync executable interpolated into the rsync command lines built below.
# It is None here; presumably assigned once rsync has been located (the
# assignment is not in this part of the file).
RSYNC_EXE = None

# Error codes. Some are passed to sys.exit() (e.g. EPYTHON_VERSION,
# ERSYNC_OPTIONS_CHECK); others are stored as "errcode" in rsync result
# dictionaries (ERSYNC_TOO_LONG, ERSYNC_JOB).
EOPTION_PARSER = 97
EPYTHON_VERSION = 10
EBUCKET_DIR_NOEXIST = 11
EBUCKET_DIR_PERMS = 12
# NOTE(review): same value as EBUCKET_DIR_PERMS above, so the two conditions
# cannot be told apart by code -- confirm whether this is intended.
EBUCKET_DIR_OSERROR = 12
EBUCKET_FILE_CREATE = 13
EBIN_NOTFOUND = 14
ESRC_NOT_DIR = 15
ESRC_NO_ACCESS = 16
EDEST_NO_ACCESS = 17
EDEST_NOT_DIR = 18
ERSYNC_OPTIONS_CHECK = 19
ERSYNC_TOO_LONG = 20
ERSYNC_JOB = 21
ERSYNC_OK = 22
EDEST_IS_FILE = 23
EDEST_CREATE = 24
ENEED_ROOT = 25
EBENCH = 26
EMSRSYNC_INTERRUPTED = 27

# "type" values of the messages put on the monitor queue by rsync workers.
TYPE_RSYNC = 0
TYPE_RSYNC_SENTINEL = 1
# "type" values of the messages put on the messages queue (see print_message).
MSG_STDERR = 10
MSG_STDOUT = 11
MSG_PROGRESS = 12

# pylint: disable=wrong-import-position

import os
import sys
import platform
import datetime
import tempfile
import shutil
import gzip
import time
import multiprocessing
import traceback
import shlex
import subprocess
import threading
import itertools
import random
import unittest
import signal
import timeit

# Encoding used by _e() to turn unicode messages into byte strings.
# NOTE(review): the condition is the constant None, so this expression always
# evaluates to 'utf8' and sys.stdout.encoding is never used. The intent was
# probably `sys.stdout.encoding or 'utf8'`; confirm before changing, since that
# would alter the bytes written on terminals that are not UTF-8.
STDOUT_ENCODING = sys.stdout.encoding if None else 'utf8'

def _e(value):
    # pylint: disable=invalid-name
    """
    Encoding helper: unicode objects are encoded with STDOUT_ENCODING,
    anything else is handed back unchanged.
    """
    return value.encode(STDOUT_ENCODING) if type(value) is unicode else value

# Use the built-in version of scandir/walk if possible, otherwise
# use the scandir module version

# Stays True unless the scandir import below fails.
USING_SCANDIR = True

try:
    # On interpreters older than 3.5, replace os.walk with the walk()
    # implementation from the third-party scandir module (module-wide patch).
    if sys.version_info < (3, 5):
        from scandir import walk
        os.walk = walk
except ImportError:
    # scandir is not installed: keep the stock os.walk and record that fact.
    USING_SCANDIR = False

from multiprocessing.managers import SyncManager

# Queue read by messages_worker() and fed by print_message(). It is None at
# import time; presumably assigned during start-up (not visible in this part
# of the file).
G_MESSAGES_QUEUE = None

# Copy and paste from bup/options.py
# I'm disabling some pylint warning here

# pylint: disable=bad-whitespace, bad-continuation, unused-variable, invalid-name, wrong-import-position
# pylint: disable=reimported, missing-docstring, too-few-public-methods, unused-argument
# pylint: disable=too-many-instance-attributes, old-style-class, too-many-locals, multiple-statements
# pylint: disable=protected-access, superfluous-parens, pointless-string-statement, too-many-branches
# pylint: disable=too-many-statements, broad-except

# Copyright 2010-2012 Avery Pennarun and options.py contributors.
# All rights reserved.
#
# (This license applies to this file but not necessarily the other files in
# this package.)
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#    1. Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in
#       the documentation and/or other materials provided with the
#       distribution.
#
# THIS SOFTWARE IS PROVIDED BY AVERY PENNARUN ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
"""Command-line options parser.
With the help of an options spec string, easily parse command-line options.

An options spec is made up of two parts, separated by a line with two dashes.
The first part is the synopsis of the command and the second one specifies
options, one per line.

Each non-empty line in the synopsis gives a set of options that can be used
together.

Option flags must be at the beginning of the line and multiple flags are
separated by commas. Usually, options have a short, one character flag, and a
longer one, but the short one can be omitted.

Long option flags are used as the option's key for the OptDict produced when
parsing options.

When the flag definition is ended with an equal sign, the option takes
one string as an argument, and that string will be converted to an
integer when possible. Otherwise, the option does not take an argument
and corresponds to a boolean flag that is true when the option is
given on the command line.

The option's description is found at the right of its flags definition, after
one or more spaces. The description ends at the end of the line. If the
description contains text enclosed in square brackets, the enclosed text will
be used as the option's default value.

Options can be put in different groups. Options in the same group must be on
consecutive lines. Groups are formed by inserting a line that begins with a
space. The text on that line will be output after an empty line.
"""
import sys, os, textwrap, getopt, re, struct


def _invert(v, invert): # pragma: no cover
    """Return the boolean negation of v when invert is truthy, else v itself."""
    return (not v) if invert else v


def _remove_negative_kv(k, v): # pragma: no cover
    """Strip a leading 'no-' / 'no_' from k and negate v; otherwise return both unchanged."""
    if not k.startswith(('no-', 'no_')):
        return k, v
    return k[3:], not v


class OptDict(object): # pragma: no cover
    """Mapping of option values that can also be read as attributes.

    Every key goes through the alias table given at construction time, and
    a "no-" or "no_" prefix on the key negates the value being stored or
    read.
    """
    def __init__(self, aliases):
        self._opts = {}
        self._aliases = aliases

    def _unalias(self, k):
        """Resolve k to (canonical key, whether the value must be inverted)."""
        stripped, negated = _remove_negative_kv(k, False)
        canonical, inverted = self._aliases[stripped]
        return canonical, inverted ^ negated

    def __setitem__(self, k, v):
        canonical, inverted = self._unalias(k)
        self._opts[canonical] = _invert(v, inverted)

    def __getitem__(self, k):
        canonical, inverted = self._unalias(k)
        stored = self._opts[canonical]
        return _invert(stored, inverted)

    def __getattr__(self, k):
        # Only reached when normal attribute lookup fails: treat the
        # attribute name as an option key.
        return self[k]


def _default_onabort(msg): # pragma: no cover
    """Default abort behaviour: leave the program with exit status 97 (msg is unused)."""
    raise SystemExit(97)


def _intify(v): # pragma: no cover
    """Return int(v) when v is the canonical decimal spelling of an integer, else v unchanged."""
    try:
        as_int = int(v or '')
    except ValueError:
        return v
    # "042" or " 7" parse as integers but do not round-trip: keep the string.
    return as_int if str(as_int) == v else v


def _atoi(v): # pragma: no cover
    """Convert v to an int; empty/None values and unparsable strings give 0."""
    try:
        number = int(v or 0)
    except ValueError:
        number = 0
    return number


def _tty_width(): # pragma: no cover
    """Return the column count of the terminal behind sys.stderr.

    Falls back to the WIDTH environment variable, then to 70, when stderr
    has no fileno (msrsync modification: xStringIO and the like) or when
    the window size cannot be queried.
    """
    def _from_environment():
        return _atoi(os.environ.get('WIDTH')) or 70

    if not hasattr(sys.stderr, "fileno"):
        return _from_environment()
    winsize = struct.pack("HHHH", 0, 0, 0, 0)
    try:
        import fcntl, termios
        winsize = fcntl.ioctl(sys.stderr.fileno(), termios.TIOCGWINSZ, winsize)
    except (IOError, ImportError):
        return _from_environment()
    columns = struct.unpack('HHHH', winsize)[1]
    return columns or 70


class Options: # pragma: no cover
    """Option parser.
    When constructed, a string called an option spec must be given. It
    specifies the synopsis and option flags and their description.  For more
    information about option specs, see the docstring at the top of this file.

    Two optional arguments specify an alternative parsing function and an
    alternative behaviour on abort (after having output the usage string).

    By default, the parser function is getopt.gnu_getopt, and the abort
    behaviour is to exit the program.
    """
    def __init__(self, optspec, optfunc=getopt.gnu_getopt,
                 onabort=_default_onabort):
        """Store the spec and parse it once to build the option tables.

        optspec -- option spec string (synopsis, a '--' line, option lines)
        optfunc -- getopt-style function called by parse()
        onabort -- callable invoked by usage() after printing; may return an
                   exception to raise
        """
        self.optspec = optspec
        self._onabort = onabort
        self.optfunc = optfunc
        self._aliases = {}
        # -h/-?/--help/--usage are always recognised
        self._shortopts = 'h?'
        self._longopts = ['help', 'usage']
        self._hasparms = {}
        self._defaults = {}
        self._usagestr = self._gen_usage()  # this also parses the optspec

    def _gen_usage(self):
        """Build the usage text and fill the alias/default/getopt tables.

        Returns the usage string. As a side effect populates _aliases,
        _hasparms, _defaults, _shortopts and _longopts from the option lines.
        """
        out = []
        lines = self.optspec.strip().split('\n')
        # reversed so that pop() hands the lines back in their original order
        lines.reverse()
        first_syn = True
        # first part: synopsis lines, up to the '--' separator
        while lines:
            l = lines.pop()
            if l == '--': break
            out.append('%s: %s\n' % (first_syn and 'usage' or '   or', l))
            first_syn = False
        out.append('\n')
        last_was_option = False
        # second part: option definitions and group headings
        while lines:
            l = lines.pop()
            if l.startswith(' '):
                # group heading; preceded by an empty line when it follows an option
                out.append('%s%s\n' % (last_was_option and '\n' or '',
                                       l.lstrip()))
                last_was_option = False
            elif l:
                # "flags description": split on the first space
                (flags,extra) = (l + ' ').split(' ', 1)
                extra = extra.strip()
                # a trailing '=' on the flags means the option takes an argument
                if flags.endswith('='):
                    flags = flags[:-1]
                    has_parm = 1
                else:
                    has_parm = 0
                # a bracketed text at the very end of the description is the default
                g = re.search(r'\[([^\]]*)\]$', extra)
                if g:
                    defval = _intify(g.group(1))
                else:
                    defval = None
                flagl = flags.split(',')
                flagl_nice = []
                # the first flag of the line is the canonical key for all its aliases
                flag_main, invert_main = _remove_negative_kv(flagl[0], False)
                self._defaults[flag_main] = _invert(defval, invert_main)
                for _f in flagl:
                    f,invert = _remove_negative_kv(_f, 0)
                    self._aliases[f] = (flag_main, invert_main ^ invert)
                    self._hasparms[f] = has_parm
                    if f == '#':
                        # '#' stands for the single-digit short options -0 .. -9
                        self._shortopts += '0123456789'
                        flagl_nice.append('-#')
                    elif len(f) == 1:
                        self._shortopts += f + (has_parm and ':' or '')
                        flagl_nice.append('-' + f)
                    else:
                        # also register the long flag with non-word characters
                        # replaced by '_' so it can be read as an attribute
                        f_nice = re.sub(r'\W', '_', f)
                        self._aliases[f_nice] = (flag_main,
                                                 invert_main ^ invert)
                        self._longopts.append(f + (has_parm and '=' or ''))
                        self._longopts.append('no-' + f)
                        flagl_nice.append('--' + _f)
                flags_nice = ', '.join(flagl_nice)
                if has_parm:
                    flags_nice += ' ...'
                prefix = '    %-20s  ' % flags_nice
                argtext = '\n'.join(textwrap.wrap(extra, width=_tty_width(),
                                                initial_indent=prefix,
                                                subsequent_indent=' '*28))
                out.append(argtext + '\n')
                last_was_option = True
            else:
                # empty line in the spec: keep it in the usage output
                out.append('\n')
                last_was_option = False
        return ''.join(out).rstrip() + '\n'

    def usage(self, msg=""):
        """Print usage string to stderr and abort."""
        sys.stderr.write(self._usagestr)
        if msg:
            sys.stderr.write(msg)
        # onabort may exit by itself, or return an exception for us to raise
        e = self._onabort and self._onabort(msg) or None
        if e:
            raise e

    def fatal(self, msg):
        """Print an error message to stderr and abort with usage string."""
        msg = '\nerror: %s\n' % msg
        return self.usage(msg)

    def parse(self, args):
        """Parse a list of arguments and return (options, flags, extra).

        In the returned tuple, "options" is an OptDict with known options,
        "flags" is a list of option flags that were used on the command-line,
        and "extra" is a list of positional arguments.
        """
        try:
            (flags,extra) = self.optfunc(args, self._shortopts, self._longopts)
        except getopt.GetoptError, e:
            # NOTE(review): if onabort returns without raising or exiting,
            # execution continues below with flags/extra unbound.
            self.fatal(e)

        opt = OptDict(aliases=self._aliases)

        # start from the defaults declared in the spec
        for k,v in self._defaults.iteritems():
            opt[k] = v

        for (k,v) in flags:
            k = k.lstrip('-')
            if k in ('h', '?', 'help', 'usage'):
                self.usage()
            if (self._aliases.get('#') and
                  k in ('0','1','2','3','4','5','6','7','8','9')):
                v = int(k)  # guaranteed to be exactly one digit
                k, invert = self._aliases['#']
                opt['#'] = v
            else:
                k, invert = opt._unalias(k)
                if not self._hasparms[k]:
                    assert(v == '')
                    # boolean flags count how many times they were given
                    v = (opt._opts.get(k) or 0) + 1
                else:
                    v = _intify(v)
            opt[k] = _invert(v, invert)
        return (opt,flags,extra)

# pylint: enable=bad-whitespace, bad-continuation, unused-variable, invalid-name
# pylint: enable=reimported, missing-docstring, too-few-public-methods, unused-argument
# pylint: enable=too-many-instance-attributes, old-style-class, too-many-locals, multiple-statements
# pylint: enable=protected-access, superfluous-parens, pointless-string-statement, too-many-branches
# pylint: enable=too-many-statements, broad-except

def print_message(message, output=MSG_STDOUT):
    """
    Queue 'message' on the messages queue, tagged with the 'output'
    message type (MSG_STDOUT by default)
    """
    payload = {"type": output, "message": message}
    G_MESSAGES_QUEUE.put(payload)


def print_update(data):
    """
    Rewrite the current terminal line with 'data': carriage return,
    ANSI "erase to end of line", then the text, flushed immediately
    """
    stream = sys.stdout
    stream.write("".join(("\r\x1b[K", data.__str__())))
    stream.flush()


class BucketError(RuntimeError):
    """
    Raised when something goes wrong while handling a bucket
    """


def _check_python_version():
    """
    Exit with EPYTHON_VERSION when the interpreter is older than python 2.6.

    The check relies on sys.version_info, which holds integers. The previous
    implementation compared the strings returned by
    platform.python_version_tuple() with integers, so it could never trigger.
    """
    if sys.version_info[:2] < (2, 6):
        python26_release = datetime.datetime(2008, 10, 1)
        now = datetime.datetime.now()
        # explicit floor division: the message uses %d
        years = (now - python26_release).days // 365
        sys.stderr.write(("You need python >= 2.6 to run this program (more than %d years old)." + os.linesep) % years)
        sys.exit(EPYTHON_VERSION)


def get_human_size(num, power="B"):
    """
    Stolen from the ps_mem.py project for nice size output :)

    Scale 'num' down by factors of 1024 until it is below 1000 and return it
    formatted with one decimal and the matching unit letter. 'power' is the
    unit 'num' is already expressed in.

    Values too large for the biggest unit ("Y") are reported in that unit
    instead of raising IndexError.
    """
    powers = ["B", "K", "M", "G", "T", "P", "E", "Z", "Y"]
    while num >= 1000: #4 digits
        position = powers.index(power)
        if position + 1 >= len(powers):
            # no bigger unit available: keep the value in the last one
            break
        num /= 1024.0
        power = powers[position + 1]
    return "%.1f %s" % (num, power)


def human_size(value):
    """
    parse the provided human size (with multiples K, M, G, T, P, E, Z, Y)
    and return bytes

    'value' is a string made of decimal digits, optionally followed by one
    suffix letter (upper or lower case); each suffix is a power of 1024.
    Returns None when 'value' does not match that format.
    """
    if value.isdigit():
        return int(value)

    number, suffix = value[:-1], value[-1:].upper()
    if not number.isdigit():
        return None

    # position in this string + 1 is the power of 1024 the suffix stands for
    suffixes = "KMGTPEZY"
    if suffix not in suffixes:
        return None

    return int(number) * 1024 ** (suffixes.index(suffix) + 1)


def crawl(path, relative=False):
    """
    Generator around os.walk yielding a (size, path) tuple for every file,
    every symlink to a directory and every empty directory underneath path.
    Sizes come from os.lstat.

    If relative is True, the leading 'path' is cut from the yielded paths.
    For example, crawl("/home/jbdenis/Code/", relative=True) will yield
    (42, "toto") for the "/home/jbdenis/Code/toto" file.

    Errors are reported through print_message and the entry is skipped.
    """
    def report(error):
        """
        helper
        """
        print_message("msrsync crawl: %s" % error, MSG_STDERR)

    prefix_len = len(path) if relative else 0

    for root, dirs, files in os.walk(path, onerror=report):
        # we want empty dir to be listed in bucket
        if not dirs and not files:
            try:
                yield os.lstat(root).st_size, root[prefix_len:]
            except OSError as err:
                report(err)
                continue

        # os.walk lists symlinks to directories in dirs; they are entries too
        linked_dirs = [d for d in dirs if os.path.islink(os.path.join(root, d))]

        for name in itertools.chain(files, linked_dirs):
            fullpath = os.path.join(root, name)
            try:
                entry_size = os.lstat(fullpath).st_size
            except OSError as err:
                report(err)
                continue

            yield entry_size, fullpath[prefix_len:]


def buckets(path, filesnr, size):
    """
    Group the entries found underneath path into buckets and yield them as
    (entries number, total size, entries list) tuples. A bucket is closed as
    soon as its total size reaches <size> bytes or it holds <filesnr> entries.
    """
    # With a trailing slash in the path we sync the content of the path;
    # without it we sync the path itself, so its last component prefixes
    # every entry.
    # Example:
    # os.path.split("/home/jbdenis/Code")[1] will return 'Code'
    # os.path.split("/home/jbdenis/Code/")[1] will return ''
    base = os.path.split(path)[1]

    entries = []
    total_bytes = 0

    for fsize, rpath in crawl(path, relative=True):
        entries.append(os.path.join(base, rpath.lstrip(os.sep)))
        total_bytes += fsize

        if total_bytes >= size or len(entries) >= filesnr:
            yield (len(entries), total_bytes, entries)
            entries = []
            total_bytes = 0

    # whatever is left forms the last, partially filled bucket
    if entries:
        yield (len(entries), total_bytes, entries)


def _valid_rsync_options(options, rsync_opts):
    """
    Check for weird stuff in rsync options

    'options' is the option parser; its fatal() method is called for every
    forbidden option found in the 'rsync_opts' string.

    Every --delete* option is rejected, and so is --del, which rsync accepts
    as an alias for --delete-during. Other options that merely start with
    "--del" (--delay-updates) are left alone.
    """
    for opt in rsync_opts.split():
        if opt == "--del" or opt.startswith("--delete"):
            options.fatal("Cannot use --delete option type with msrsync. It would lead to disaster :)")


def parse_cmdline(cmdline_argv):
    """
    command line parsing of msrsync using bup/options.py
    See https://github.com/bup/bup/blob/master/lib/bup/options.py

    cmdline_argv is the full argument list, program name included. It is
    modified in place when a "-r" token is present (rewritten to "--rsync").

    Returns an (opt, srcs, dest) tuple: the parsed options, the list of
    sources and the destination. For --selftest, --bench, --benchshm and
    --version (without --rsync) it returns early with (opt, [], "").
    Invalid input is reported through options.fatal().
    """

    # If I want to run this script on RHEL6 and derivatives without installing any dependencies
    # except python and rsync, I can't rely on argparse which is only available in python >= 2.7
    # standard library. I don't want to rely on the installation of python-argparse for python 2.6
    options = Options(MSRSYNC_OPTSPEC)

    # it looks soooo fragile, but it works for me here.
    # this block extracts the provided rsync options if present
    # it assumes that the SRC... DEST arguments are at the end of the command line
    # I cannot use the options parser to parse --rsync since msrsync uses some option
    # names already used by rsync. So I only parse the command line up to the --rsync token
    # and ugly parse what I want. Any better idea ?
    if "-r" in cmdline_argv:
        # only the first "-r" token is rewritten
        idx = cmdline_argv.index("-r")
        cmdline_argv[idx] = "--rsync"

    if "--rsync" in cmdline_argv:
        idx = cmdline_argv.index("--rsync")
        # we parse the command line up to --rsync options marker
        (opt, _, extra) = options.parse(cmdline_argv[1:idx])
        if len(cmdline_argv[idx:]) < 4: # we should have, at least, something like --rsync "-avz --whatever" src dest
            options.fatal('You must provide a source, a destination and eventually rsync options with --rsync')
        # plain attribute assignment: stored on the instance, not in the option table
        opt.rsync = opt.r = cmdline_argv[idx+1] # pylint: disable=invalid-name, attribute-defined-outside-init
        _valid_rsync_options(options, opt.rsync)
        # everything after the rsync options string: sources first, destination last
        srcs, dest = cmdline_argv[idx+2:-1], cmdline_argv[-1]
    else:
        # no --rsync options marker on the command line.
        (opt, _, extra) = options.parse(cmdline_argv[1:])
        if opt.selftest or opt.bench or opt.benchshm or opt.version: # early exit
            return opt, [], ""
        opt.rsync = opt.r = DEFAULT_RSYNC_OPTIONS # pylint: disable=attribute-defined-outside-init
        if not extra or len(extra) < 2:
            options.fatal('You must provide a source and a destination')
        srcs, dest = extra[:-1], extra[-1]

    # the option parser may already have turned a plain number into an int, hence str()
    size = human_size(str(opt.size))
    if not size:
        # None (bad format) and 0 are both rejected
        options.fatal("'%s' does not look like a valid size value" % opt.size)
    try:
        # pylint: disable=attribute-defined-outside-init, invalid-name
        opt.files = opt.f = int(opt.f)
    except ValueError:
        options.fatal("'%s' does not look like a valid files number value" % opt.f)
    opt.size = opt.s = size # pylint: disable=invalid-name, attribute-defined-outside-init
    opt.compress = False # pylint: disable=attribute-defined-outside-init

    return opt, srcs, dest


def rmtree_onerror(func, path, exc_info):
    """
    Error handler for shutil.rmtree.

    Prints the path that could not be removed on stderr. 'func' and
    'exc_info' are accepted but not used.
    """
    # pylint: disable=unused-argument
    print >>sys.stderr, "Error removing", path


def write_bucket(filename, bucket, compress=False):
    """
    Dump bucket filenames in an optionally compressed file

    'filename' is a (file descriptor, path) pair as returned by
    tempfile.mkstemp; the descriptor is closed in both modes. Every entry is
    written followed by a NUL byte (rsync --from0 format). The compressed
    branch used to omit that separator, which glued all entries together.

    Raises BucketError when the file cannot be written.
    """
    try:
        fileno, path = filename
        if not compress:
            with os.fdopen(fileno, 'wb') as bfile:
                for entry in bucket:
                    bfile.write(entry + b'\0')
        else:
            # gzip reopens the file by path, the descriptor is not needed
            os.close(fileno)
            with gzip.open(path, 'wb') as bfile:
                for entry in bucket:
                    bfile.write(entry + b'\0')
    except IOError as err:
        raise BucketError("Cannot write bucket file %s: %s" % (path, err))


def consume_queue(jobs_queue):
    """
    Yield the items taken from a shared queue until the StopIteration
    class itself shows up as an item (it is consumed, not yielded)
    """
    item = jobs_queue.get()
    while item is not StopIteration:
        yield item
        item = jobs_queue.get()


# stolen from Forest http://stackoverflow.com/questions/1191374/subprocess-with-timeout
def kill_proc(proc, timeout):
    """
    Timer callback for run: flag the timeout in the shared 'timeout'
    dictionary, then kill the process (an OSError from kill is ignored)
    """
    timeout["value"] = True
    try:
        proc.kill()
    except OSError:
        # best effort
        return


# stolen and adapted from Forest http://stackoverflow.com/questions/1191374/subprocess-with-timeout
def run(cmd, capture_stdout=False, capture_stderr=False, timeout_sec=sys.maxsize):
    """
    Run the 'cmd' command line (split with shlex) and kill it after
    'timeout_sec' seconds.

    Returns a (return code, stdout, stderr, timed out, elapsed seconds) tuple.
    stdout/stderr are decoded as utf-8 and are empty unless the matching
    capture_* flag is set.

    If the command cannot be launched the tuple is
    (-1, "", "Cannot launch ...", False, 0). If interrupted by the user the
    child is terminated and the tuple is (666, "", "Interrupted", False, 0).
    """
    # Bound up front so that the handlers below never hit an unbound name,
    # e.g. when the interruption happens before the process is created.
    proc = None
    timer = None
    timeout = {"value": False}
    try:
        stdout_p = subprocess.PIPE if capture_stdout else None
        stderr_p = subprocess.PIPE if capture_stderr else None
        proc = subprocess.Popen(shlex.split(cmd), stdout=stdout_p, stderr=stderr_p)
        timer = threading.Timer(timeout_sec, kill_proc, [proc, timeout])
        starttime = time.time()
        timer.start()
        stdout, stderr = proc.communicate()
        elapsed = time.time() - starttime
    except OSError as err:
        return -1, "", "Cannot launch %s: %s" % (cmd, err), False, 0
    except KeyboardInterrupt:
        if proc is not None:
            if proc.stdout:
                proc.stdout.close()
            if proc.stderr:
                proc.stderr.close()
            try:
                proc.terminate()
            except OSError:
                # the process is already gone
                pass
            proc.wait()
        return 666, "", "Interrupted", False, 0
    finally:
        # never leave the watchdog timer armed, whatever happened above
        if timer is not None:
            timer.cancel()

    # communicate() gives None for streams that were not captured
    if stdout is None:
        stdout = b""
    if stderr is None:
        stderr = b""

    return proc.returncode, stdout.decode("utf-8", "replace"), stderr.decode("utf-8", "replace"), timeout["value"], elapsed


# stolen from stackoverflow (http://stackoverflow.com/a/377028)
def which(program):
    """
    Python implementation of the which command

    Returns 'program' itself when it contains a directory part and is an
    executable file, the first matching executable found in the PATH
    directories otherwise, or None when nothing matches.

    When PATH is not set, os.defpath is searched instead of raising KeyError.
    """
    def is_exe(fpath):
        """ helper """
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    fpath, _ = os.path.split(program)
    if fpath:
        if is_exe(program):
            return program
    else:
        search_path = os.environ.get("PATH", os.defpath)
        for path in search_path.split(os.pathsep):
            exe_file = os.path.join(path, program)
            if is_exe(exe_file):
                return exe_file

    return None


def _check_rsync_options(options):
    """
    Build a command line given the rsync options string
    and try to execute it on empty directory

    Exits with ERSYNC_OPTIONS_CHECK when rsync fails, takes more than 60
    seconds, or when the temporary files cannot be set up. The temporary
    directories and log file are removed in every case.
    """
    rsync_cmd = None
    # Bound up front so that the cleanup below only touches what was created.
    src = dst = rsync_log_fd = rsync_log = None
    try:
        src = tempfile.mkdtemp()
        dst = tempfile.mkdtemp()
        rsync_log_fd, rsync_log = tempfile.mkstemp()
        rsync_cmd = "%s %s %s %s" % (RSYNC_EXE, options + ' --quiet --stats --verbose --from0 --log-file %s' % rsync_log, src + os.sep, dst)
        ret, _, stderr, timeout, _ = run(rsync_cmd, timeout_sec=60) # this should not take more than one minute =)
        if timeout:
            print >>sys.stderr, '''Error during rsync options check command "%s": took more than 60 seconds !''' % rsync_cmd
            sys.exit(ERSYNC_OPTIONS_CHECK)
        elif ret != 0:
            print >>sys.stderr, '''Error during rsync options check command "%s": %s''' % (rsync_cmd, 2*os.linesep + stderr)
            sys.exit(ERSYNC_OPTIONS_CHECK)
    except OSError as err:
        # str(err): concatenating the exception object itself raises TypeError
        if rsync_cmd:
            print >>sys.stderr, '''Error during rsync options check command "%s": %s''' % (rsync_cmd, 2*os.linesep + str(err))
        else:
            print >>sys.stderr, '''Error during rsync options check ("%s"): %s''' % (options, 2*os.linesep + str(err))
        sys.exit(ERSYNC_OPTIONS_CHECK)
    finally:
        # each cleanup step is attempted on its own, so that one failure
        # does not leave the remaining temporary files behind
        cleanups = ((os.rmdir, src), (os.rmdir, dst), (os.close, rsync_log_fd), (os.remove, rsync_log))
        for cleanup, target in cleanups:
            if target is None:
                continue
            try:
                cleanup(target)
            except OSError:
                pass


def run_rsync(files_from, rsync_opts, src, dest, timeout=3600*24*7):
    """
    Perform rsync using the --files-from option

    files_from -- path of the bucket file; the rsync log is written to
                  files_from + '.log'
    rsync_opts -- user rsync options string
    src, dest  -- rsync source and destination
    timeout    -- seconds after which the rsync process is killed

    Returns a dictionary with the "rcode", "msg", "cmdline" and "log" keys,
    plus "elapsed" once the command has been run and "errcode"
    (ERSYNC_TOO_LONG or ERSYNC_JOB) when something went wrong.
    """
    # this looks very close to the _check_rsync_options function...
    # except the error message
    rsync_log = files_from + '.log'
    # the bucket and log paths are quoted like src and dest, so that a
    # buckets directory containing spaces survives the shlex split in run()
    msrsync_opts = '--quiet --verbose --stats --from0 --files-from="%s" --log-file "%s"' % (files_from, rsync_log)
    rsync_cmd = '%s %s %s "%s" "%s"' % (RSYNC_EXE, rsync_opts, msrsync_opts, src, dest)

    rsync_result = dict()
    rsync_result["rcode"] = -1
    rsync_result["msg"] = None
    rsync_result["cmdline"] = rsync_cmd
    rsync_result["log"] = rsync_log

    try:
        ret, _, stderr, timed_out, elapsed = run(rsync_cmd, timeout_sec=timeout)

        rsync_result["rcode"] = ret
        rsync_result["elapsed"] = elapsed

        # rsync's own stderr is not captured, so a non-empty text here comes
        # from run() itself (launch failure or interruption): keep the reason
        if stderr:
            rsync_result["msg"] = stderr

        if timed_out:
            rsync_result["errcode"] = ERSYNC_TOO_LONG
        elif ret != 0:
            rsync_result["errcode"] = ERSYNC_JOB

    except OSError as err:
        rsync_result["errcode"] = ERSYNC_JOB
        rsync_result["msg"] = str(err)

    return rsync_result


def rsync_worker(jobs_queue, monitor_queue, options, dest):
    """
    Worker process body: take (src, bucket file, entries number, size) jobs
    from the jobs queue, run rsync on each one (unless options.dry_run is set)
    and report every result on the monitor queue.
    """

    try:
        for job in consume_queue(jobs_queue):
            src, files_from, bucket_files_nr, bucket_size = job
            if options.dry_run:
                rsync_result = dict(rcode=0, elapsed=0, errcode=0, msg='')
            else:
                rsync_result = run_rsync(files_from, options.rsync, src, dest)
            monitor_queue.put({"type": TYPE_RSYNC,
                               "rsync_result": rsync_result,
                               "size": bucket_size,
                               "files_nr": bucket_files_nr,
                               "jq_size": jobs_queue.qsize()})
    except (KeyboardInterrupt, SystemExit):
        pass
    finally:
        # put the end marker back on the jobs queue for whoever reads it next
        jobs_queue.put(StopIteration)
        # we insert a sentinel value to inform the monitor this process finished
        monitor_queue.put({"type": TYPE_RSYNC_SENTINEL, "pid": os.getpid()})


def handle_rsync_error_result(rsync_result):
    """
    Report a failed rsync result on stderr through print_message.

    'rsync_result' is a dictionary as built by run_rsync. The message depends
    on its "errcode" (ERSYNC_TOO_LONG or ERSYNC_JOB); anything else, including
    a missing "errcode", is reported as an unknown status.
    """
    msg = rsync_result["msg"] if rsync_result["msg"] is not None else ''
    errcode = rsync_result.get("errcode")

    # The command line is appended AFTER the % formatting: it may itself
    # contain '%' characters (rsync --out-format, paths...), which would break
    # the formatting if it were part of the format string.
    if errcode == ERSYNC_TOO_LONG:
        text = "rsync command took too long and has been killed (see '%s' rsync log file): %s\n" % (rsync_result["log"], msg)
        print_message(text + rsync_result["cmdline"], MSG_STDERR)
    elif errcode == ERSYNC_JOB:
        text = "errors during rsync command (see '%s' rsync log file): %s\n" % (rsync_result["log"], msg)
        print_message(text + rsync_result["cmdline"], MSG_STDERR)
    else:
        print_message("unknown rsync_result status: %s" % rsync_result, MSG_STDERR)


def rsync_monitor_worker(monitor_queue, nb_rsync_processes, total_size, total_files_nr, crawl_time, total_time, options):
    """
    Monitor process body: consume the results sent by the rsync workers on
    the monitor queue, keep running totals, emit progress messages when
    options.progress is set and finally put a statistics dictionary back on
    the monitor queue.
    """
    done_bytes = 0
    done_entries = 0
    rsync_runtime = 0
    sentinels_seen = 0
    buckets_nr = 0
    rsync_errors = 0
    entries_per_second = 0
    bytes_per_second = 0

    try:
        started = timeit.default_timer()
        for result in consume_queue(monitor_queue):
            kind = result["type"]
            if kind == TYPE_RSYNC_SENTINEL:
                # not needed, but we keep it for now
                sentinels_seen += 1
                continue
            if kind != TYPE_RSYNC:
                print_message("rsync_monitor_worker process received an incompatile type message: %s" % result, MSG_STDERR)
                continue

            rsync_result = result["rsync_result"]

            # failed buckets are reported and left out of the totals
            if rsync_result["rcode"] != 0:
                rsync_errors += 1
                handle_rsync_error_result(rsync_result)
                continue

            buckets_nr += 1
            done_bytes += result["size"]
            done_entries += result["files_nr"]
            rsync_runtime += rsync_result["elapsed"]
            elapsed = timeit.default_timer() - started

            if elapsed > 0:
                bytes_per_second = done_bytes / elapsed
                entries_per_second = done_entries / elapsed
            else:
                bytes_per_second = 0
                entries_per_second = 0

            if options.progress:
                progress = "[%d/%d entries] [%s/%s transferred] [%d entries/s] [%s/s bw] [monq %d] [jq %d]" % (
                    done_entries,
                    total_files_nr.value,
                    get_human_size(done_bytes),
                    get_human_size(total_size.value),
                    entries_per_second,
                    get_human_size(bytes_per_second),
                    monitor_queue.qsize(),
                    result["jq_size"])
                print_message(progress, MSG_PROGRESS)

        if rsync_errors > 0:
            print_message("\nmsrsync error: somes files/attr were not transferred (see previous errors)", MSG_STDERR)

        stats = {
            "errors": rsync_errors,
            "total_size": total_size.value,
            "total_entries": total_files_nr.value,
            "buckets_nr": buckets_nr,
            "bytes_per_second": bytes_per_second,
            "entries_per_second": entries_per_second,
            "rsync_workers": nb_rsync_processes,
            "rsync_runtime": rsync_runtime,
            "crawl_time": crawl_time.value,
            "total_time": total_time.value,
        }

        monitor_queue.put(stats)

    except (KeyboardInterrupt, SystemExit):
        pass


def messages_worker(options):
    """
    Body of the process in charge of printing messages

    Every entry read from G_MESSAGES_QUEUE (through consume_queue) is a
    mapping with a "type" and a "message". Progress entries are handed to
    print_update, the other ones are printed on stdout or stderr.
    The "options" argument is not used here.
    """
    previous_type = None
    try:
        for entry in consume_queue(G_MESSAGES_QUEUE):
            # when the entry handled just before was a progress update, the
            # next regular message is prefixed with a line separator
            separator = os.linesep if previous_type == MSG_PROGRESS else ''
            entry_type = entry["type"]

            if entry_type == MSG_PROGRESS:
                print_update(entry["message"])
            else:
                if entry_type == MSG_STDOUT:
                    stream, text = sys.stdout, entry["message"]
                elif entry_type == MSG_STDERR:
                    stream, text = sys.stderr, entry["message"]
                else:
                    # unexpected type: report the whole entry on stderr
                    stream = sys.stderr
                    text = "Unknown message type '%s': %s" % (entry_type, entry)
                print >>stream, _e(separator + text)

            previous_type = entry_type

    except (KeyboardInterrupt, SystemExit):
        pass
    finally:
        # finish the last progress line, if any
        if previous_type == MSG_PROGRESS:
            print >>sys.stdout, ''

def start_rsync_workers(jobs_queue, monitor_queue, options, dest):
    """
    Spawn "options.processes" processes running rsync_worker

    Each process receives (jobs_queue, monitor_queue, options, dest).
    Return the list of the started multiprocessing.Process objects.
    """
    worker_args = (jobs_queue, monitor_queue, options, dest)
    workers = []
    for _ in xrange(options.processes):
        worker = multiprocessing.Process(target=rsync_worker, args=worker_args)
        workers.append(worker)
        worker.start()
    return workers


def start_rsync_monitor_worker(monitor_queue, nb_rsync_processes, total_size, total_files_nr, crawl_time, total_time, options):
    """
    Spawn the process running rsync_monitor_worker

    All the arguments are forwarded, in the same order, to
    rsync_monitor_worker. Return the started multiprocessing.Process.
    """
    monitor_args = (monitor_queue,
                    nb_rsync_processes,
                    total_size,
                    total_files_nr,
                    crawl_time,
                    total_time,
                    options)
    monitor = multiprocessing.Process(target=rsync_monitor_worker, args=monitor_args)
    monitor.start()
    return monitor


def start_messages_worker(options):
    """
    Spawn the process running messages_worker

    "options" is forwarded to messages_worker as its only argument.
    Return the started multiprocessing.Process.
    """
    printer = multiprocessing.Process(target=messages_worker, args=(options,))
    printer.start()
    return printer


def multiprocess_mgr_init():
    """
    Initializer meant for the SyncManager process of the msrsync function

    It makes the calling process ignore SIGINT.
    """
    ignore_handler = signal.SIG_IGN
    signal.signal(signal.SIGINT, ignore_handler)


def show_stats(msrsync_stat):
    """
    Show the stats from msrsync run
    """
    s = msrsync_stat
    if s["errors"] == 0:
        status = "SUCCESS"
    else:
        status = "FAILURE, %d rsync processe(s) had errors" % s["errors"]

    print "Status:", status
    print "Working directory:", os.getcwd()
    print "Command line:", " ".join(sys.argv)
    print "Total size: %s" % get_human_size(s["total_size"])
    print "Total entries: %s" % s["total_entries"]
    buckets_nr = s["buckets_nr"]
    print "Buckets number: %d" % buckets_nr
    if buckets_nr > 0:
        print "Mean entries per bucket: %d" % ((s["total_entries"] * 1.)/ buckets_nr)
        print "Mean size per bucket: %s" % get_human_size((s["total_size"] * 1.)/ buckets_nr)

    print "Entries per second: %d" % s["entries_per_second"]
    print "Speed: %s/s" % get_human_size(s["bytes_per_second"])
    print "Rsync workers: %d" % s["rsync_workers"]
    print "Total rsync's processes (%d) cumulative runtime: %.1fs" % (buckets_nr, s["rsync_runtime"])
    print "Crawl time: %.1fs (%.1f%% of total runtime)" % (s["crawl_time"], 100* s["crawl_time"]/s["total_time"])
    print "Total time: %.1fs" % s["total_time"]


def msrsync(options, srcs, dest):
    """
    multi-stream rsync reusable function
    It will copy srcs directories to dest honoring the options structure

    Steps, as implemented below:
      - create a working "msrsync-" temporary directory for the bucket files,
        inside options.buckets when it is provided (options.buckets is then
        overwritten with the path of the created directory). The process
        exits with EBUCKET_DIR_NOEXIST, EBUCKET_DIR_PERMS or
        EBUCKET_DIR_OSERROR when this is not possible.
      - start a SyncManager, the shared counters and queues, then the rsync
        workers, the monitor worker and the messages worker.
      - for each source, write every bucket yielded by buckets() to a
        temporary file and push a job (src_base, filename, bucket_files_nr,
        bucket_size) on the jobs queue.
      - wait for the workers, read the stats dict from the monitor queue and
        optionally display it.

    Returns run_stats["errors"] when the run goes to its end. When one of the
    exceptions handled at the bottom is caught, no return statement is
    executed, so the function returns None.

    In every case the manager is shut down and the buckets directory is
    removed, unless options.keep is set.
    """
    global G_MESSAGES_QUEUE
    # pylint: disable=too-many-branches, too-many-locals
    try:
        if not options.buckets:
            options.buckets = tempfile.mkdtemp(prefix="msrsync-")
        else:
            if not os.path.exists(options.buckets):
                print >>sys.stderr, options.buckets, "bucket directory does not exist."
                sys.exit(EBUCKET_DIR_NOEXIST)
            if not os.access(options.buckets, os.W_OK):
                print >>sys.stderr, options.buckets, "bucket directory is not writable."
                sys.exit(EBUCKET_DIR_PERMS)
            options.buckets = tempfile.mkdtemp(prefix="msrsync-", dir=options.buckets)
    except OSError, err:
        print >>sys.stderr, '''Error with bucket directory creation: "%s"''' % err
        sys.exit(EBUCKET_DIR_OSERROR)

    if options.show:
        print "buckets dir is", options.buckets

    manager = SyncManager()
    # manager.start(multiprocess_mgr_init) cannot be used here: according to
    # the original author, passing an initializer requires python 2.7
    manager.start()

    # values shared with the monitor worker process
    total_size = manager.Value('i', 0)
    total_files_nr = manager.Value('i', 0)
    crawl_time = manager.Value('f', 0)
    total_time = manager.Value('f', 0)

    monitor_queue = manager.Queue() # pylint: disable=no-member
    jobs_queue = manager.Queue() # pylint: disable=no-member
    G_MESSAGES_QUEUE = manager.Queue() # pylint: disable=no-member

    rsync_workers_procs = start_rsync_workers(jobs_queue, monitor_queue, options, dest)
    rsync_monitor_worker_proc = start_rsync_monitor_worker(monitor_queue, options.processes, total_size, total_files_nr, crawl_time, total_time, options)
    messages_worker_proc = start_messages_worker(options)

    crawl_start = timeit.default_timer()

    try:
        total_size.value = 0
        bucket_nr = 0
        for src in srcs:
            # do we want to sync the content of src or src itself ?
            # os.path.split("/home/jbdenis/Code")[0] will return '/home/jbdenis'
            # os.path.split("/home/jbdenis/Code/")[0] will return '/home/jbdenis/Code'
            # We need that to correctly generate the --files-from files and the src path
            # that will be used on the rsync command line

            # if src is a single directory, without trailing slash
            # os.path.split("src") returns ('', 'src')
            head, tail = os.path.split(src)
            if head == '':
                src_base = os.getcwd()
            else:
                src_base = head

            for bucket_files_nr, bucket_size, bucket in buckets(src, options.files, options.s):
                total_size.value += bucket_size
                total_files_nr.value += bucket_files_nr
                # from the rsync man page (--files-from part):
                # NOTE: sorting the list of files in the --files-from input
                # helps rsync to be more efficient, as it will avoid re-visiting
                # the path elements that are shared  between  adjacent
                # entries.  If the input is not sorted, some path elements
                # (implied directories) may end up being scanned multiple times,
                # and rsync will eventually unduplicate them after they
                # get turned into file-list elements.
                bucket.sort()
                # the idea is to avoid having too many files in a single
                # directory: 1024 consecutive buckets share the same directory
                # (integer division under python 2)
                d1s = str(bucket_nr / 1024).zfill(8)
                # with bucket_nr == 12, d1s == '00000000'
                # tdir = options.buckets/0000/0000
                # with bucket_nr == 11058, d1s == '00000010' (11058/1024)
                # tdir = options.buckets/0000/0010
                # with bucket_nr == 148472185, d1s == '00144992' (148472185/1024)
                # tdir = options.buckets/0014/4992
                try:
                    tdir = os.path.join(options.buckets, d1s[:4], d1s[4:])
                    if not os.path.exists(tdir):
                        os.makedirs(tdir)
                    # the fd is closed within write_bucket
                    fileno, filename = tempfile.mkstemp(dir=tdir)
                except OSError, err:
                    # the bucket is skipped: its entries were already counted
                    # in total_size/total_files_nr but no job is queued for it
                    print_message('msrsync scan: cannot create temporary bucket file: "%s"' % err, MSG_STDERR)
                    continue
                write_bucket((fileno, filename), bucket, options.compress)
                bucket_nr += 1
                jobs_queue.put((src_base, filename, bucket_files_nr, bucket_size))

        crawl_time.value = timeit.default_timer() - crawl_start

        # StopIteration is pushed as an end-of-queue marker; presumably the
        # consumers recognise it through consume_queue -- confirm there
        jobs_queue.put(StopIteration)

        for worker in rsync_workers_procs:
            worker.join()

        # measured from the start of the crawl, so it includes the crawl time
        total_time.value = timeit.default_timer() - crawl_start

        monitor_queue.put(StopIteration)
        rsync_monitor_worker_proc.join()

        G_MESSAGES_QUEUE.put(StopIteration)
        messages_worker_proc.join()

        # we retrieve the last element from the queue, which is a stats dict
        run_stats = monitor_queue.get()
        if options.stats:
            show_stats(run_stats)

        return run_stats["errors"]

    except (KeyboardInterrupt, SystemExit):
        # interrupted: stop every child process before the final cleanup
        for worker in rsync_workers_procs:
            worker.terminate()
            worker.join()

        rsync_monitor_worker_proc.terminate()
        rsync_monitor_worker_proc.join()

        messages_worker_proc.terminate()
        messages_worker_proc.join()
    except BucketError, err:
        print >>sys.stderr, err
    except Exception: # pylint: disable=broad-except
        print >>sys.stderr, "Uncaught exception:" + os.linesep + traceback.format_exc()
    finally:
        manager.shutdown()
        if options.buckets is not None and not options.keep:
            shutil.rmtree(options.buckets, onerror=rmtree_onerror)


def _check_executables():
    """
    Locate the external programs msrsync relies on

    Each required program is searched with which(). When one of them is
    missing, a message is printed on stderr and the process exits with
    EBIN_NOTFOUND. The path of rsync is stored in the RSYNC_EXE global.
    """
    global RSYNC_EXE # pylint: disable=global-statement

    located = {}
    for name in ("rsync",):
        full_path = which(name)
        if not full_path:
            print >>sys.stderr, "Cannot find '%s' executable in PATH." % name
            sys.exit(EBIN_NOTFOUND)
        located[name] = full_path

    RSYNC_EXE = located["rsync"]


def _check_srcs_dest(srcs, dest):
    """
    Validate the source directories and the destination

    Every source must be a readable and traversable directory. The
    destination is created with os.mkdir when it does not exist; it must not
    be a regular file and, when it is a directory, it must be writable and
    traversable. On the first problem found, a message is printed on stderr
    and the process exits with the matching E* code.
    """
    def _abort(message, code):
        """ print message on stderr and exit with code """
        print >>sys.stderr, message
        sys.exit(code)

    for src in srcs:
        if not os.path.isdir(src):
            _abort("Source '%s' is not a directory" % src, ESRC_NOT_DIR)
        if not os.access(src, os.R_OK|os.X_OK):
            _abort("No access to source directory '%s'" % src, ESRC_NO_ACCESS)

    # rsync itself would create a missing destination ("rsync -a src dst"),
    # but doing it here allows to report the errors ourselves
    if not os.path.exists(dest):
        try:
            os.mkdir(dest)
        except OSError as err:
            _abort("Error creating destination directory '%s': %s" % (dest, err), EDEST_CREATE)

    if os.path.isfile(dest):
        _abort("Destination '%s' already exists and is a file" % dest, EDEST_IS_FILE)

    dest_usable = os.access(dest, os.W_OK|os.X_OK) if os.path.isdir(dest) else True
    if not dest_usable:
        _abort("Destination directory '%s' not writable" % dest, EDEST_NO_ACCESS)


def _create_level_entries(cwd, max_entries, files_pct):
    """
    Helper for testing purpose

    It will create "max_entries" entries in "cwd" with
    files_pct percent of files. The rest will be directories
    """
    dirs = list()
    files_nr = 0
    level_entries = random.randint(0, max_entries)

    for _ in xrange(level_entries):
        if random.randint(1, 100) <= files_pct:
            fhandle, _ = tempfile.mkstemp(dir=cwd)
            os.close(fhandle)
            files_nr += 1
        else:
            dirname = tempfile.mkdtemp(dir=cwd)
            dirs.append(dirname)

    return files_nr + len(dirs), dirs


def _create_fake_tree(cwd, total_entries, max_entries_per_level, max_depth, files_pct):
    """
    Helper for testing purpose

    Populate "cwd" with files and directories until at least "total_entries"
    entries exist, asking _create_level_entries for at most
    "max_entries_per_level" entries per visited directory (and never more
    than what is still missing). Directories whose depth below "cwd" reaches
    "max_depth" are not populated.

    "files_pct" is the percentage of files among the created entries: with
    90, about 90% of files and 10% of directories are created.

    Return the number of created entries.

    NOTE(review): with max_depth <= 0 every directory, including "cwd", is
    skipped, so the loop never ends when total_entries > 0.
    """
    pending = [cwd]
    created = 0
    prefix_len = len(cwd)

    while created < total_entries:
        # nothing left to visit: start again from the root of the tree
        if not pending:
            pending.append(cwd)

        current = pending.pop()

        # the depth is the number of separators below the root
        depth = current[prefix_len:].count(os.sep)
        if depth >= max_depth:
            continue

        batch_limit = min(max_entries_per_level, total_entries - created)
        made, subdirs = _create_level_entries(current, batch_limit, files_pct)
        created += made
        pending.extend(subdirs)

    return created


def _compare_trees(first, second):
    """
    Helper for testing purpose

    Build the sorted listing of the relative paths found by crawl() under
    "first" and under "second", and return True when both listings are
    equal, False otherwise.

    Each tree gets its own list. Binding both names to one shared list (and
    appending both crawls to it) makes the comparison always succeed.

    Note: since os.walk inherits the behaviour of os.listdir, the same tree
    can be listed in different orders, hence the sort before comparing.
    It does not scale: ideally the listings would be written to temporary
    files and merge-sorted (like sort(1)). Use only on trees of reasonable
    size.
    """
    first_list = sorted(cur for _, cur in crawl(first, relative=True))
    second_list = sorted(cur for _, cur in crawl(second, relative=True))

    return first_list == second_list


class TestHelpers(unittest.TestCase):
    """
    Unit tests of the size conversion helpers (get_human_size, human_size)
    """
    # pylint: disable=too-many-public-methods
    def test_get_human_size(self):
        """ convert bytes to human readable string """
        self.assertEqual(get_human_size(1024), '1.0 K')

    def test_get_human_size2(self):
        """ convert bytes to human readable string """
        one_mebibyte = 1024*1024
        self.assertEqual(get_human_size(one_mebibyte), '1.0 M')

    def test_human_size(self):
        """ convert human readable size to bytes """
        self.assertEqual(human_size("1024"), 1024)

    def test_human_size2(self):
        """ convert human readable size to bytes """
        self.assertEqual(human_size("1M"), 1024*1024)

    def test_human_size3(self):
        """ wrongly formatted size """
        # a doubled suffix is not a valid size
        self.assertEqual(human_size("10KK"), None)

    def test_human_size4(self):
        """ bad suffix  """
        # "Q" is not expected to be accepted as a suffix
        self.assertEqual(human_size("10Q"), None)


class TestOptionsParser(unittest.TestCase):
    """
    Test the command line parsing

    The tests about invalid command lines share _assert_parser_exit, which
    checks that parse_cmdline leaves through SystemExit with the
    EOPTION_PARSER code.
    """
    # pylint: disable=too-many-public-methods
    def _assert_parser_exit(self, cmdline):
        """ check that parsing cmdline exits with EOPTION_PARSER """
        try:
            parse_cmdline(shlex.split(cmdline))
        except SystemExit as err:
            self.assertEqual(err.code, EOPTION_PARSER)
            return

        self.fail("Should have raised a SystemExit exception")

    def test_nooption(self):
        """ parse cmdline without argument"""
        self._assert_parser_exit("msrsync")

    def test_justrsync(self):
        """ parse cmdline with only --rsync option"""
        self._assert_parser_exit("msrsync --rsync")

    def test_badsize(self):
        """ parse cmdline with a bad size"""
        self._assert_parser_exit("msrsync -s abcde src dest")

    def test_badsize2(self):
        """ parse cmdline with a bad size given with the long option"""
        # this test used to be an exact copy of test_badsize: it now
        # exercises the long form of the same option
        self._assert_parser_exit("msrsync --size abcde src dest")

    def test_bad_filesnumber(self):
        """ parse cmdline with a bad files number"""
        self._assert_parser_exit("msrsync -f abcde src dest")

    def test_only_src(self):
        """ parse cmdline with only a source dir"""
        self._assert_parser_exit("msrsync src")

    def test_src_dst(self):
        """ test a basic and valid command line """
        cmdline = shlex.split("msrsync src dst")
        opt, srcs, dst = parse_cmdline(cmdline)
        self.assertEqual(opt.rsync, DEFAULT_RSYNC_OPTIONS)
        self.assertEqual(srcs, ["src"])
        self.assertEqual(dst, "dst")

    def test_src_multiple_dst(self):
        """ test a command line with multiple sources """
        cmdline = shlex.split("msrsync src1 src2 dst")
        opt, srcs, dst = parse_cmdline(cmdline)
        self.assertEqual(opt.rsync, DEFAULT_RSYNC_OPTIONS)
        self.assertEqual(srcs, ["src1", "src2"])
        self.assertEqual(dst, "dst")

    def test_src_dst_rsync(self):
        """ test a basic and valid command line with rsync option """
        cmdline = shlex.split("""msrsync --rsync "--numeric-ids" src dst""")
        opt, srcs, dst = parse_cmdline(cmdline)
        self.assertEqual(opt.rsync, "--numeric-ids")
        self.assertEqual(srcs, ["src"])
        self.assertEqual(dst, "dst")

    def test_src_multiple_dst_rsync(self):
        """ test a command line with multiple sources and rsync option """
        cmdline = shlex.split("""msrsync --rsync "--numeric-ids" src1 src2 dst""")
        opt, srcs, dst = parse_cmdline(cmdline)
        self.assertEqual(opt.rsync, "--numeric-ids")
        self.assertEqual(srcs, ["src1", "src2"])
        self.assertEqual(dst, "dst")

    def test_src_dest_empty_rsync(self):
        """ test a basic and valid command line, but with empty rsync option """
        # "src" is taken as the value of --rsync, which leaves a single path
        self._assert_parser_exit("msrsync --rsync src dst")

    def test_rsync_delete(self):
        """ command line with --rsync option that contains --delete """
        # both src and dst are given, so that the rejection can only come
        # from --delete and not from a missing positional argument
        self._assert_parser_exit("""msrsync --rsync "--delete" src dst""")

    def test_rsync_delete2(self):
        """ command line with --rsync option that contains --delete """
        self._assert_parser_exit("""msrsync --rsync "-a --numeric-ids --delete" src dst""")

    def test_rsync_delete3(self):
        """ command line with -r option that contains --delete """
        self._assert_parser_exit("""msrsync -r "-a --numeric-ids --delete" src dst""")


class TestRsyncOptionsChecker(unittest.TestCase):
    """
    Unit tests of _check_rsync_options
    """
    # pylint: disable=too-many-public-methods
    def test_rsync_wrong_options(self):
        """ test with wrong_options """
        bogus_options = "--this-is-fake"
        try:
            _check_rsync_options(bogus_options)
        except SystemExit as err:
            # the checker is expected to leave with its dedicated exit code
            self.assertEqual(err.code, ERSYNC_OPTIONS_CHECK)
        else:
            self.fail("Should have raised a SystemExit exception")


class TestSyncAPI(unittest.TestCase):
    """
    Test msrsync by directly calling python function
    It is redundant with TestSyncCLI but it makes coverage.py happy =)
    """

    # pylint: disable=too-many-public-methods
    def setUp(self):
        """ create a temporary fake tree """
        _check_executables()
        self.src = tempfile.mkdtemp(prefix='msrsync_testsync_')
        self.dst = tempfile.mkdtemp(prefix='msrsync_testsync_')
        _create_fake_tree(self.src, total_entries=1234, max_entries_per_level=123, max_depth=5, files_pct=95)

    def tearDown(self):
        """ remove the temporary fake tree """
        for path in self.src, self.dst:
            if os.path.exists(path):
                shutil.rmtree(path, onerror=rmtree_onerror)

    def _msrsync_test_helper(self, options=""):
        """
        msrsync test helper

        Run main() on a command line built from "options", the source tree
        and the destination, then check that both trees hold the same
        entries.
        """
        # the trailing separator asks for the content of src to be synced
        # (same convention as TestSyncCLI): without it the src directory
        # itself is created inside dst and the two trees cannot be equal
        cmdline = """msrsync %s %s %s""" % (options, self.src + os.sep, self.dst)
        main(shlex.split(cmdline))
        self.assert_(_compare_trees(self.src, self.dst), "The source %s and destination %s tree are not equal." % (self.src, self.dst))

    def test_simple_msrsync_api(self):
        """ test simple msrsync synchronisation """
        self._msrsync_test_helper()

    def test_msrsync_api_2_processes(self):
        """ test simple msrsync synchronisation """
        self._msrsync_test_helper(options='-p 2')

    def test_msrsync_api_4_processes(self):
        """ test simple msrsync synchronisation """
        self._msrsync_test_helper(options='-p 4')

    def test_msrsync_api_8_processes(self):
        """ test simple msrsync synchronisation """
        self._msrsync_test_helper(options='-p 8')


class TestSyncCLI(unittest.TestCase):
    """
    End-to-end tests running this script (and plain rsync) as commands
    """

    # pylint: disable=too-many-public-methods
    def setUp(self):
        """ create a temporary fake tree """
        _check_executables()
        prefix = 'msrsync_testsync_'
        self.src = tempfile.mkdtemp(prefix=prefix)
        self.dst = tempfile.mkdtemp(prefix=prefix)
        _create_fake_tree(self.src, total_entries=1234, max_entries_per_level=123, max_depth=5, files_pct=95)

    def tearDown(self):
        """ remove the temporary fake tree """
        for tree in (self.src, self.dst):
            if not os.path.exists(tree):
                continue
            shutil.rmtree(tree, onerror=rmtree_onerror)

    def _msrsync_test_helper(self, options=""):
        """ run this script with options on the fake tree and compare the result """
        script = os.path.realpath(__file__)
        # trailing separator: sync the content of src, not src itself
        command = "%s %s %s %s" % (script, options, self.src + os.sep, self.dst)
        status, _, _, timed_out, _ = run(command, timeout_sec=60)
        self.assertFalse(timed_out, "The msrsync command has timeouted.")
        self.assertEqual(status, 0, "The msrsync command has failed.")
        same = _compare_trees(self.src, self.dst)
        self.assertTrue(same, "The source %s and destination %s tree are not equal." % (self.src, self.dst))


    def test_simple_rsync(self):
        """ test simple rsync synchronisation """
        command = "%s %s %s %s" % (RSYNC_EXE, DEFAULT_RSYNC_OPTIONS, self.src + os.sep, self.dst)
        status, _, _, timed_out, _ = run(command, timeout_sec=60)
        self.assertFalse(timed_out, "The rsync command has timeouted.")
        self.assertEqual(status, 0, "The rsync command has failed.")
        same = _compare_trees(self.src, self.dst)
        self.assertTrue(same, "The source and destination tree are not equal. %s %s" % (self.src, self.dst))

    def test_simple_msrsync_cli(self):
        """ test simple msrsync synchronisation """
        self._msrsync_test_helper()

    def test_simple_msrsync_progress_cli(self):
        """ test simple msrsync synchronisation """
        self._msrsync_test_helper(options='--progress')

    def test_msrsync_progress_cli_2_processes(self):
        """ test simple msrsync synchronisation """
        self._msrsync_test_helper(options='--progress -p 2')

    def test_msrsync_cli_2_processes(self):
        """ test simple msrsync synchronisation """
        self._msrsync_test_helper(options='-p 2')

    def test_msrsync_cli_4_processes(self):
        """ test simple msrsync synchronisation """
        self._msrsync_test_helper(options='-p 4')

    def test_msrsync_cli_8_processes(self):
        """ test simple msrsync synchronisation """
        self._msrsync_test_helper(options='-p 8')


def selftest():
    """
    Run the embedded test cases with a verbose text runner

    The outcome of the run is only displayed: nothing is returned.
    """
    loader = unittest.TestLoader()
    suite = unittest.TestSuite()

    test_cases = (TestHelpers,
                  TestOptionsParser,
                  TestRsyncOptionsChecker,
                  TestSyncAPI,
                  TestSyncCLI)

    for test_case in test_cases:
        suite.addTest(loader.loadTestsFromTestCase(test_case))

    runner = unittest.TextTestRunner(verbosity=2)
    runner.run(suite)


def _check_root(msg=None):
    """ Check if the caller is running under root """
    msg = "Need to be root" if not msg else msg
    if os.geteuid() != 0:
        print >>sys.stderr, "You're not root. Buffer cache will not be dropped between run. Take the result with caution."
    return True


def drop_caches(value=3):
    """
    Write "value" to /proc/sys/vm/drop_caches

    Nothing is done when the effective user is not root. When the file does
    not exist, a message is printed on stderr instead.
    """
    if os.geteuid() != 0:
        return

    drop_caches_path = "/proc/sys/vm/drop_caches"
    if not os.path.exists(drop_caches_path):
        print >>sys.stderr, "/proc/sys/vm/drop_caches does not exist. Cannot drop buffer cache"
        return

    with open(drop_caches_path, "w") as control_file:
        control_file.write(str(value))


def bench(total_entries=10000, max_entries_per_level=128, max_depth=5, files_pct=95, src=None, dst=None):
    """
    Embedded benchmark runner

    Build a fake tree in "src", copy it once with plain rsync to get a
    reference duration, then copy it with msrsync using 1, 2, 4, 8 and 16
    processes, printing each duration and its speedup against rsync.

    "src" and "dst" default to fresh temporary directories, which are then
    removed at the end. The copies are made to a sub-directory of "dst" that
    is removed, along with an attempt to drop the caches, before each run.
    """
    # pylint: disable=too-many-arguments
    def _run_or_die(cmd):
        """ run cmd and return its duration, leave the benchmark on failure """
        status, _, errors, timed_out, duration = run(cmd, timeout_sec=900)
        if status == 666:
            sys.exit(EMSRSYNC_INTERRUPTED)
        if timed_out or status != 0:
            print >>sys.stderr, "Problem running %s, aborting benchmark: %s" % (cmd, errors)
            sys.exit(EBENCH)
        return duration

    def _bench_msrsync(options, src, dst, reference_result):
        """ time one msrsync run and print it with its speedup """
        command = "%s %s %s %s" % (os.path.realpath(__file__), options, src, dst)
        duration = _run_or_die(command)
        print >>sys.stdout, "msrsync %s took %.2f seconds (speedup x%.2f)" % (options, duration, reference_result/duration)


    _check_executables()
    owns_src = owns_dst = False
    try:
        if src is None:
            src = tempfile.mkdtemp()
            owns_src = True

        if dst is None:
            dst = tempfile.mkdtemp()
            owns_dst = True

        # the real target, so that it can be removed between runs
        target = tempfile.mkdtemp(dir=dst)

        def _reset_target():
            """ remove the previous copy and drop the caches """
            shutil.rmtree(target, onerror=rmtree_onerror)
            drop_caches()

        _create_fake_tree(src, total_entries=total_entries, max_entries_per_level=max_entries_per_level, max_depth=max_depth, files_pct=files_pct)

        print >>sys.stdout, "Benchmarks with %d entries (%d%% of files):" % (total_entries, files_pct)

        # trailing separator: copy the content of src, not src itself
        src_content = src + os.sep

        # reference run with a single plain rsync
        _reset_target()
        rsync_elapsed = _run_or_die("%s %s %s %s" % (RSYNC_EXE, DEFAULT_RSYNC_OPTIONS, src_content, target))
        print >>sys.stdout, "rsync %s took %.2f seconds (speedup x1)" % (DEFAULT_RSYNC_OPTIONS, rsync_elapsed)

        for processes in (1, 2, 4, 8, 16):
            _reset_target()
            _bench_msrsync('--processes %d --files 1000 --size 1G' % processes, src_content, target, rsync_elapsed)

    finally:
        # only the directories created here are removed
        for owned, path in ((owns_src, src), (owns_dst, dst)):
            if owned and os.path.exists(path):
                shutil.rmtree(path, onerror=rmtree_onerror)


def benchshm(total_entries=10000, max_entries_per_level=128, max_depth=5, files_pct=95):
    """
    Embedded benchmark runner working in a shared memory directory

    Create a source and a destination temporary directory under the
    directory named by the SHM environment variable (default: /dev/shm), run
    bench() from the source to the destination, then remove both.

    The process exits with EBENCH when the directories cannot be created.
    """
    shm = os.getenv("SHM", "/dev/shm")
    try:
        src = tempfile.mkdtemp(dir=shm)
        dst = tempfile.mkdtemp(dir=shm)
    except OSError as err:
        print >>sys.stderr, "Error creating temporary bench directories in %s: %s" % (shm, err)
        sys.exit(EBENCH)

    try:
        # the tree is generated in src and synced to dst: passing dst for
        # both made the benchmark build and copy the tree inside dst itself
        bench(total_entries=total_entries, max_entries_per_level=max_entries_per_level, max_depth=max_depth, files_pct=files_pct, src=src, dst=dst)
    finally:
        if os.path.exists(src):
            shutil.rmtree(src, onerror=rmtree_onerror)
        if os.path.exists(dst):
            shutil.rmtree(dst, onerror=rmtree_onerror)


def main(cmdline):
    """
    Entry point: parse "cmdline" and run the requested action

    The --version, --selftest, --bench and --benchshm actions end with
    sys.exit(0). Otherwise the executables, the sources, the destination and
    the rsync options are checked and the value returned by msrsync() is
    returned.
    """
    _check_python_version()
    opts, sources, destination = parse_cmdline(cmdline)

    if opts.version:
        if not USING_SCANDIR:
            print >>sys.stdout, "%s (no scandir optimization. Use python 3.5+ or install the scandir module)" % VERSION
        else:
            print >>sys.stdout, "%s" % VERSION
        sys.exit(0)

    if opts.selftest:
        selftest()
        sys.exit(0)

    # each benchmark is gated on its option and on the result of _check_root()
    if opts.bench and _check_root():
        bench(total_entries=100000)
        sys.exit(0)

    if opts.benchshm and _check_root():
        benchshm(total_entries=100000)
        sys.exit(0)

    # regular synchronisation
    _check_executables()
    _check_srcs_dest(sources, destination)
    _check_rsync_options(opts.rsync)
    errors = msrsync(opts, sources, destination)
    return errors


if __name__ == '__main__':
    # The value returned by main() (and any explicit sys.exit(code) raised
    # below it) must become the exit status of the process, so SystemExit is
    # left alone: catching it here made the script always exit with 0.
    try:
        sys.exit(main(sys.argv))
    except KeyboardInterrupt:
        # interrupted by the user: leave quietly, without a traceback
        pass