So we've copied everything from the file now right?

Well, we've copied all the data, and depending on your application that might be enough, but files also have metadata to worry about.

We can see this by example, by checking the output of ls -l, before and after moving the file to a different filesystem.

$ touch /run/user/$(id -u)/testfile
$ ls -l /run/user/$(id -u)/testfile
-rw-rw-r-- 1 richardmaw richardmaw 0 Aug  8 19:40 /run/user/1000/testfile
$ ./my-mv /run/user/$(id -u)/testfile testfile
$ ls -l testfile
-rw------- 1 richardmaw richardmaw 0 Aug  8 19:41 testfile

You should be able to see that the -rw-rw-r-- mode string, which represents readable for everyone and writable for the user and group, has become -rw-------, which represents read and write for the user only.

This is because ls(1) uses stat(2), which returns different metadata for the new file: our copy created the target with mode 0600 and never applied the mode of the source.

Setting mode

stat(2) provided the mode of the file, in the st_mode field.

chmod(2) can be used to set the mode of the new file.

The result of stat(2) isn't exactly the same format as chmod(2) takes, since in the stat(2) field it includes bits saying what type of file it is, but chmod(2) can't change what type a file is, so is only interested in the portion of the mode that is the permission bits.

/*
 * Copy the contents of srcfd into tgtfd by trying three strategies in order:
 * btrfs_clone_contents(), sparse_copy_contents(), naive_contents_copy().
 *
 * A strategy that fails with errno == EINVAL is treated as "not applicable
 * here" and the next one is tried.  Any other failure is reported with
 * perror() and aborts the copy.
 *
 * Returns the non-negative result of the strategy that succeeded, -1 if a
 * strategy failed with an error other than EINVAL, or whatever
 * naive_contents_copy() returns when it is reached.
 */
int copy_contents(int srcfd, int tgtfd) {
    int status;

    status = btrfs_clone_contents(srcfd, tgtfd);
    if (status >= 0)
        return status;
    if (errno != EINVAL) {
        /* The clone failed for a reason other than "cannot clone this",
           so falling back would not help */
        perror("Copy file");
        return -1;
    }

    status = sparse_copy_contents(srcfd, tgtfd);
    if (status >= 0)
        return status;
    if (errno != EINVAL) {
        /* The sparse copy failed for a reason other than "cannot do a
           sparse copy", so falling back would not help */
        perror("Copy file");
        return -1;
    }

    /* Last resort: its result is returned unchanged. */
    return naive_contents_copy(srcfd, tgtfd);
}


/*
 * Copy the file at `source` to `target`, duplicating its contents and its
 * permission bits.
 *
 * source     - path of the file to read.
 * target     - path of the file to create or overwrite.
 * no_clobber - when true the target is opened with O_EXCL, so the copy fails
 *              if the target already exists.
 *
 * The target is created with mode 0600 and only given the source's mode once
 * the contents have been copied.
 *
 * Returns 0 on success and a negative value on failure; every failure
 * detected here is reported with perror() (copy_contents() reports its own).
 */
int copy_file(char *source, char *target, bool no_clobber) {
    int srcfd = -1;
    int tgtfd = -1;
    int ret = -1;
    struct stat source_stat;

    srcfd = open(source, O_RDONLY);
    if (srcfd == -1) {
        perror("Open source file");
        goto cleanup;
    }

    tgtfd = open(target, O_WRONLY|O_CREAT|(no_clobber ? O_EXCL : 0), 0600);
    if (tgtfd == -1) {
        perror("Open target file");
        goto cleanup;
    }

    ret = copy_contents(srcfd, tgtfd);
    if (ret < 0)
        goto cleanup;

    ret = fstat(srcfd, &source_stat);
    if (ret < 0) {
        perror("Stat source file");
        goto cleanup;
    }

    /* st_mode also carries the file-type bits; fchmod() is only interested
       in the permission bits, so mask everything else off. */
    ret = fchmod(tgtfd, source_stat.st_mode & 07777);
    if (ret < 0)
        perror("Chmod target file");

cleanup:
    /* Only close descriptors that were actually opened. */
    if (srcfd >= 0)
        close(srcfd);
    /* A failed close() on the written file can be the only report of a
       deferred write error, so it must turn success into failure. */
    if (tgtfd >= 0 && close(tgtfd) < 0 && ret >= 0) {
        perror("Close target file");
        ret = -1;
    }
    return ret;
}

User and Group

User and Group are numeric IDs that ls(1) looks up in /etc/passwd and /etc/group to turn into a human readable name.

The chown(1) and chgrp(1) commands take a name, but the chown(2) system call sets both the user and the group using numeric IDs.

The user and group can be found in the stat(2) result in the st_uid and st_gid fields.

setgid bits

If the setgid bit is set on a directory, then files newly created in it take the group of the directory rather than the group of the user that created them, but files that are moved in keep the group they had before.

Depending on your application, it may make more sense to inherit the group or to preserve it from the original file.

/* Policy for choosing the group of a file that has been moved or copied. */
enum setgid {
    /* Intended to take the target directory's group only when that
       directory is setgid. */
    SETGID_AUTO = 0,
    /* Keep the user and group recorded for the source file. */
    SETGID_NEVER = 1,
    /* Take the target directory's group unconditionally. */
    SETGID_ALWAYS = 2,
};


/*
 * Set the ownership of the already-open target file according to `setgid`.
 *
 * target      - path of the target; only used to locate its parent directory.
 *               NOTE: it is handed to dirname(3), which POSIX allows to
 *               modify the string, so callers should not rely on its
 *               contents afterwards.
 * source_stat - stat data of the source file; only read for SETGID_NEVER.
 * setgid      - SETGID_NEVER:  give the target the source's user and group.
 *               SETGID_ALWAYS: give the target its parent directory's group.
 *               SETGID_AUTO:   as SETGID_ALWAYS, but only if the parent
 *                              directory has its setgid bit set.
 * tgtfd       - open file descriptor for the target.
 *
 * Returns 0 on success (including when nothing needed changing) and a
 * negative value on failure, after reporting the error with perror().
 */
static int fix_owner(char *target, struct stat *source_stat, enum setgid setgid, int tgtfd) {
    struct stat target_stat;
    struct stat dirname_stat;
    char *target_dirname;
    int ret = 0;

    if (setgid == SETGID_NEVER) {
        ret = fchown(tgtfd, source_stat->st_uid, source_stat->st_gid);
        if (ret < 0)
            perror("Chown target");
        return ret;
    }

    ret = fstat(tgtfd, &target_stat);
    if (ret < 0) {
        perror("Stat target file");
        return ret;
    }

    target_dirname = dirname(target);
    ret = stat(target_dirname, &dirname_stat);
    if (ret < 0) {
        perror("Stat target directory");
        return ret;
    }

    /* The setgid bit lives in the directory's mode, not in its group ID. */
    if ((setgid == SETGID_ALWAYS
         || (setgid == SETGID_AUTO && (dirname_stat.st_mode & S_ISGID)))
        && target_stat.st_gid != dirname_stat.st_gid) {
        /* Keep the current owner; only the group follows the directory. */
        ret = fchown(tgtfd, target_stat.st_uid, dirname_stat.st_gid);
        if (ret < 0)
            perror("Chown target");
    }

    return ret;
}

/*
 * Fix up the ownership of `target` after it has been renamed into place.
 *
 * Opens the target and delegates to fix_owner(); see that function for the
 * meaning of `source_stat` and `setgid`.
 *
 * Returns the result of fix_owner(), or -1 if the target cannot be opened
 * (reported with perror()).
 */
static int fix_rename_owner(char *target, struct stat *source_stat, enum setgid setgid) {
    int tgtfd;
    int ret;

    /* fchown() needs no particular access mode, so ask for as little as
       possible: O_RDWR would fail for read-only files and for directories.
       O_NONBLOCK keeps a read-only open of a FIFO from waiting for a writer. */
    tgtfd = open(target, O_RDONLY | O_NONBLOCK);
    if (tgtfd == -1) {
        perror("Open target file");
        return -1;
    }

    ret = fix_owner(target, source_stat, setgid, tgtfd);
    close(tgtfd);
    return ret;
}

/*
 * Move `source` to `target`.
 *
 * A rename is attempted first: renameat2(), then plain rename() when
 * renameat2() is unimplemented and clobbering is allowed.  If the rename
 * fails with EXDEV, the file is copied with copy_file() and the source is
 * unlinked.
 *
 * source     - path of the file to move.
 * target     - path to move it to.
 * no_clobber - when true, refuse to replace an existing target.
 * setgid     - ownership policy applied to the target afterwards.
 *
 * Returns 0 on success and a non-zero value on failure.
 */
int move_file(char *source, char *target, bool no_clobber, enum setgid setgid) {
    int ret;
    struct stat source_stat;
    bool have_source_stat = false;

    /* SETGID_NEVER restores the source's owner, which has to be recorded
       before the rename makes the source path disappear. */
    if (setgid == SETGID_NEVER) {
        ret = stat(source, &source_stat);
        if (ret < 0) {
            perror("Stat source file");
            return ret;
        }
        have_source_stat = true;
    }

    ret = renameat2(AT_FDCWD, source, AT_FDCWD, target, no_clobber ? RENAME_NOREPLACE : 0);
    if (ret == 0)
        return fix_rename_owner(target, &source_stat, setgid);
    if (errno == EXDEV)
        goto xdev;
    if (errno != ENOSYS) {
        perror("rename2");
        return ret;
    }
    /* Have to skip to copy if unimplemented since rename can't detect EEXIST */
    if (no_clobber)
        goto xdev;

    ret = rename(source, target);
    if (ret == 0)
        return fix_rename_owner(target, &source_stat, setgid);
    if (errno == EXDEV)
        goto xdev;
    perror("rename");
    return ret;

xdev:
    /* The copy always needs the source's stat data. */
    if (!have_source_stat) {
        ret = stat(source, &source_stat);
        if (ret < 0) {
            perror("Stat source file");
            return ret;
        }
    }

    ret = copy_file(source, target, &source_stat, no_clobber, setgid);
    if (ret != 0)
        return ret;

    /* Only remove the source once the copy is known to have succeeded. */
    ret = unlink(source);
    if (ret < 0)
        perror("unlink");
    return ret;
}

Modification time

mtime and atime are the "last modification time" and "last access time".

This has classically been set with the utimes(2) system call, but this does not support nanosecond precision, so the futimens(2) system call is used.

This takes a pair of struct timespecs, and the times from the stat(2) result can be retrieved in struct timespec format in the st_atim and st_mtim fields.

/*
 * Copy the file at `source` to `target`, duplicating its contents and as
 * much of its stat(2) metadata as can be set: ownership, permission bits,
 * access time and modification time.
 *
 * source      - path of the file to read.
 * target      - path of the file to create or overwrite.
 * source_stat - stat data of the source, supplied by the caller.
 * no_clobber  - when true the target is opened with O_EXCL, so the copy
 *               fails if the target already exists.
 * setgid      - ownership policy handed to fix_owner().
 *
 * Returns 0 on success and a negative value on failure; every failure
 * detected here is reported with perror() (copy_contents() and fix_owner()
 * report their own).
 */
int copy_file(char *source, char *target, struct stat *source_stat, bool no_clobber, enum setgid setgid) {
    int srcfd = -1;
    int tgtfd = -1;
    int ret = -1;

    srcfd = open(source, O_RDONLY);
    if (srcfd == -1) {
        perror("Open source file");
        goto cleanup;
    }

    tgtfd = open(target, O_WRONLY|O_CREAT|(no_clobber ? O_EXCL : 0), 0600);
    if (tgtfd == -1) {
        perror("Open target file");
        goto cleanup;
    }

    ret = copy_contents(srcfd, tgtfd);
    if (ret < 0)
        goto cleanup;

    /* Ownership goes first: changing the owner or group can clear the
       setuid/setgid bits, so the mode has to be applied afterwards. */
    ret = fix_owner(target, source_stat, setgid, tgtfd);
    if (ret < 0)
        goto cleanup;

    /* st_mode also carries the file-type bits; fchmod() is only interested
       in the permission bits, so mask everything else off. */
    ret = fchmod(tgtfd, source_stat->st_mode & 07777);
    if (ret < 0) {
        perror("Chmod target file");
        goto cleanup;
    }

    {
        /* futimens() takes the access time first, then the modification time. */
        struct timespec times[] = { source_stat->st_atim, source_stat->st_mtim, };
        ret = futimens(tgtfd, times);
        if (ret < 0)
            perror("Set target file times");
    }

cleanup:
    /* Only close descriptors that were actually opened. */
    if (srcfd >= 0)
        close(srcfd);
    /* A failed close() on the written file can be the only report of a
       deferred write error, so it must turn success into failure. */
    if (tgtfd >= 0 && close(tgtfd) < 0 && ret >= 0) {
        perror("Close target file");
        ret = -1;
    }
    return ret;
}

For convenience of testing, the full my-mv.c source file and Makefile, including the new copy functions, can be downloaded.

Unfixable data

Link count

The stat data returns how many other directory entries point to the same file in the st_nlink field.

We could only copy this correctly by making the same number of links, but this is unlikely to matter, and can't be fixed, unless we're copying a whole directory tree.

Creation/change time

There's another time in the stat(2) result, ctime. This is an unchangeable last changed time. It can only be set to an approximate value, by changing the system clock and modifying the file.

This is not worth the effort, as it requires elevated privileges and can cause problems for other programs.

Device and inode

There are two other fields, st_dev and st_ino, which identify which filesystem the file is on and which file on that filesystem it is.

It doesn't tell you much, other than whether the file is the same as another, which can be used to detect whether you would accidentally trash a file if you were to copy the contents of one file into another, or in the case of tar(1), whether a file was replaced in between it being created and its metadata being updated.

st_dev on its own has also classically been used to determine whether two files are on the same filesystem, but btrfs can provide different st_dev values for files on the same filesystem, but in different subvolumes, and bind-mounts may have the same st_dev for logically different mounts.

The btrfs weirdness can be solved by using statfs(2) to determine whether the files are on btrfs, and using BTRFS_IOC_FS_INFO and BTRFS_IOC_DEV_INFO to find out which block device the filesystem was mounted from, then calling stat(2) on the device node to find its st_dev.

The bind-mounts can be solved by getting the mount ID, either using name_to_handle_at(2), or opening the file and reading /proc/self/fdinfo/$fd to read the mnt_id field, and comparing the mnt_id of the two files.

So we've made stat as similar as we can, that's all the metadata right?

Not quite, there's some less common metadata to apply.