Copying extended attributes

Extended attributes allow you to provide extra metadata for a file.

It's effectively a key-value store using a string for the key, but values can be arbitrary blobs.

Use flistxattr(2) to find out which xattrs exist, fgetxattr(2) to read the value of an xattr, and fsetxattr(2) to write an xattr to a file.

Regular users can set xattrs whose names begin with user., and as far as Linux is concerned their contents are arbitrary data that it doesn't need to interpret.

However there are attributes outside the user. namespace which have special meaning to Linux, so we shouldn't try to copy everything as it is.

/*
 * Grow the heap buffer *buf to twice its current capacity *size.
 *
 * buf:  in/out pointer to a buffer obtained from malloc/realloc (or NULL).
 * size: in/out capacity of *buf in bytes.
 *
 * A capacity of zero grows to one byte, because doubling zero would leave
 * the caller retrying forever with a buffer that never gets bigger.
 *
 * Returns 0 on success with *buf and *size updated. Returns -1 on failure,
 * leaving *buf and *size untouched; errno is ENOMEM when the doubled size
 * would not fit in a size_t.
 */
static int realloc_double(void **buf, size_t *size) {
    size_t new_size;
    void *new_buf;

    if (*size == 0) {
        new_size = 1;
    } else if (*size > (size_t)-1 / 2) {
        /* Doubling would wrap around and silently shrink the buffer. */
        errno = ENOMEM;
        return -1;
    } else {
        new_size = *size * 2;
    }

    /* Keep the old pointer until realloc succeeds so nothing leaks. */
    new_buf = realloc(*buf, new_size);
    if (new_buf == NULL)
        return -1;

    *buf = new_buf;
    *size = new_size;
    return 0;
}

/*
 * Read the extended attribute names of fd into a heap buffer.
 *
 * names: in/out buffer; pass *names == NULL to have one allocated.
 *        The caller owns the buffer and must free() it.
 * size:  in: capacity of *names in bytes; out: number of bytes of
 *        NUL-terminated names stored in *names.
 *
 * Returns 0 on success, -1 on failure with errno set by the failing call.
 */
static int xattr_list(int fd, char **names, size_t *size) {
    ssize_t ret;

    if (*names == NULL || *size == 0) {
        char *buf;
        size_t capacity;

        /* Ask how much room the current name list needs. */
        ret = TEMP_FAILURE_RETRY(flistxattr(fd, NULL, 0));
        if (ret < 0)
            return -1;

        /*
         * Always keep at least one byte of capacity: malloc(0) may return
         * NULL without that being an error, and calling flistxattr with a
         * size of zero only reports the required size without filling the
         * buffer.
         */
        capacity = ret > 0 ? (size_t)ret : 1;
        buf = realloc(*names, capacity);
        if (buf == NULL)
            return -1;
        *names = buf;
        *size = capacity;
    }

    for (;;) {
        ret = TEMP_FAILURE_RETRY(flistxattr(fd, *names, *size));
        if (ret >= 0) {
            *size = (size_t)ret;
            return 0;
        }

        if (errno != ERANGE)
            return -1;

        /* New xattr added since the size was probed: grow and retry. */
        if (realloc_double((void **)names, size) < 0)
            return -1;
    }
}

/*
 * Read the value of extended attribute `name` of fd into a heap buffer.
 *
 * value: in/out buffer; pass *value == NULL to have one allocated. The
 *        buffer may be reused across calls and must be free()d by the caller.
 * size:  in: capacity of *value in bytes; out: length of the value read.
 *
 * Returns 0 on success, -1 on failure with errno set by the failing call
 * (for example ENODATA when the attribute does not exist).
 */
static int xattr_get(int fd, const char *name, void **value, size_t *size) {
    ssize_t ret;

    if (*value == NULL || *size == 0) {
        void *buf;
        size_t capacity;

        /* Ask how large the value currently is. */
        ret = TEMP_FAILURE_RETRY(fgetxattr(fd, name, NULL, 0));
        if (ret < 0)
            return -1;

        /*
         * Always keep at least one byte of capacity: malloc(0) may return
         * NULL without that being an error, and calling fgetxattr with a
         * size of zero only reports the required size without copying the
         * value. This also covers a buffer reused after an empty value.
         */
        capacity = ret > 0 ? (size_t)ret : 1;
        buf = realloc(*value, capacity);
        if (buf == NULL)
            return -1;
        *value = buf;
        *size = capacity;
    }

    for (;;) {
        /* Fetch the attribute value itself, not the attribute name list. */
        ret = TEMP_FAILURE_RETRY(fgetxattr(fd, name, *value, *size));
        if (ret >= 0) {
            *size = (size_t)ret;
            return 0;
        }

        if (errno != ERANGE)
            return -1;

        /* xattr grew since the size was probed: grow and retry. */
        if (realloc_double(value, size) < 0)
            return -1;
    }
}

/* Return 1 if string s1 begins with the prefix s2, otherwise 0. */
static int str_starts_with(const char *s1, const char *s2) {
    /* Walk both strings until the prefix is exhausted or they diverge. */
    for (; *s2 != '\0'; s1++, s2++) {
        if (*s1 != *s2)
            return 0;
    }
    return 1;
}

/*
 * Copy every extended attribute in the "user." namespace from srcfd to
 * tgtfd. Attributes outside that namespace are skipped.
 *
 * Returns a negative value as soon as listing, reading or writing an
 * attribute fails; otherwise the result of the last operation performed.
 */
static int copy_xattrs(int srcfd, int tgtfd) {
    char *names = NULL;
    size_t names_size = 0;
    void *value = NULL;
    size_t value_size = 0;
    ssize_t ret = xattr_list(srcfd, &names, &names_size);

    if (ret >= 0) {
        const char *end = names + names_size;

        /* The list is a run of NUL-terminated names packed back to back. */
        for (char *name = names; name < end; name += strlen(name) + 1) {
            /* Skip xattrs that need special handling */
            if (!str_starts_with(name, "user."))
                continue;

            ret = xattr_get(srcfd, name, &value, &value_size);
            if (ret < 0)
                break;

            ret = TEMP_FAILURE_RETRY(
                fsetxattr(tgtfd, name, value, value_size, 0));
            if (ret < 0)
                break;
        }
    }

    free(names);
    free(value);
    return ret;
}

POSIX ACLs

Feature from a POSIX design specification that wasn't widely adopted, but Linux supports the draft specification.

Not used much outside of NFS or SAMBA. You would be forgiven for not knowing they exist.

How they work is beyond the scope of this article, but man7.org covers some details about what it does and lwn.net covers some limitations.

For this article we're not concerned with how to use POSIX ACLs. They are relevant to us because they work by storing data in a special attribute, so if we want to preserve them we need to copy system.posix_acl_access.

Since this attribute doesn't start with "user." we need root privileges to copy it faithfully.

/*
 * Copy the "system.posix_acl_access" extended attribute from srcfd to
 * tgtfd. A source file without that attribute (ENODATA) is not an error.
 *
 * Returns 0 on success or when there is nothing to copy, and a negative
 * value when reading or writing the attribute fails.
 */
int copy_posix_acls(int srcfd, int tgtfd) {
    static const char name[] = "system.posix_acl_access";
    void *acl = NULL;
    size_t acl_len = 0;
    int status = xattr_get(srcfd, name, &acl, &acl_len);

    if (status >= 0) {
        status = TEMP_FAILURE_RETRY(fsetxattr(tgtfd, name, acl, acl_len, 0));
    } else if (errno == ENODATA) {
        /* The source simply has no ACL attached. */
        status = 0;
    }

    free(acl);
    return status;
}

As with previous articles, the full version of the my-mv.c source file and the Makefile may be downloaded.

Surely that's the last of the metadata!

I mentioned some extended attributes have special meaning to Linux. POSIX ACLs can be sanely copied, but some attributes have tricky semantics that require special handling.

Posted Wed Oct 5 11:00:09 2016 Tags:

It was late in the afternoon. The city was in that quiet period between the lunch time rush hour and the end of work day rush hour. The sun was getting low in the sky. My office blinds were making shafts of light that highlighted the dust particles hanging in the air. I was bored, but looking forward to an early evening date with a bottle of soda and a video stream to my couch.

Suddenly, blam, a letter dropped into my INBOX. Oh, no! So much for my evening plans.

I took my feet off my desk, and opened the missive. It was a report of the worst kind: someone's found a bug, and I would have to get off my gluteus maximus to squash it, and squash it good. I opened my drawer, and took out the key to the hardware cabinet, and strapped a .33 under my left armpit. (That's a .33 liter can of cola, not a gun. Using guns to deal with bugs would be ridiculous, what kind of a thug did you think I am?)


So you have a free software project with users. Sooner or later someone will report a problem that they're experiencing, and this may turn out to be an actual bug in your program that you need to fix. Let's have a short look at how to do that well.

Some recommendations, based on over three decades of writing software:

  • You should document how to report issues. Making unhappy users dig for that information just makes them angry.

  • You should track reports of issues separately from confirmed bugs. A bug is something you need to fix; an issue is something whose cause you need to investigate. An issue report is a discussion to find out what is wrong, and it may or may not turn into a confirmed bug that causes code changes.

  • Try to keep the list of open reports as short as possible, and also the list of confirmed, but unfixed bugs. Long lists are demoralising, and it also takes effort to pick the next thing to pay attention to from them. This is all waste.

  • It's good to have a public tracker of issues and bugs. It can be as easy as a static web page you maintain manually, or it can be something automated, such as the Debian BTS, or Bugzilla, or any of the myriad other options.

  • Automated trackers tend to enforce some process. Some of them make this rather heavy, others keep it quite lightweight. For small projects, lightweight is a much better option. Only add a more heavyweight process after it's already clearly needed.

  • You should probably make it easy to report issues. Some people prefer to make it less easy in order to avoid too many issues being reported for trivial things, but my preference is to keep the threshold as low as possible.

  • Whatever you do, treat those who report issues with kindness. Be gracious. Be friendly. Be open to the possibility that you've made a mistake.

Posted Wed Oct 12 11:00:07 2016 Tags:

Dealing with semantically important xattrs

We previously spoke about extended attributes like they were just another piece of metadata attached to files.

However some have rather awkward interfaces as far as copying is concerned, some because they don't depend on the file's contents itself, and some because they are filesystem specific.

Selinux labels

Selinux is a complicated mandatory access control mechanism.

Rather than store the access control rules in the file, like POSIX ACLs do, the rules are stored elsewhere in the kernel and a reference to what kind of file it is, is stored in the file as an extended attribute called the "security label".

The security label and the security context of the process accessing the file are looked up in the ACL rules in the kernel to determine whether the operation is permitted.

The details of how to define Selinux rules are complicated and beyond the scope of this article. We only care how we should reapply the rules when moving the file.

While we could copy the label from the old file into the new file, as we did for POSIX ACLs, Selinux contexts are defined by their file paths rather than the inodes, so after we move a file we should relabel it to what the file should have in the new location.

Using selinux_restorecon(3) might be tempting, but it leaves open a race condition where the file would be created with the wrong context, so it is temporarily accessible with the wrong label.

If the file context should be preserved from the original file, then you must read the context from the extended attribute, either directly with fgetxattr(2) or fgetfilecon(3), and then set the context before creating the new file with setfscreatecon(3).

If instead it should have the label that the path database says it should be, then the required context can be found by using selabel_open(3) with SELABEL_CTX_FILE to get a reference to the file contexts database, then getting the label it should have at that path using selabel_lookup(3), and setting the context for new files with setfscreatecon(3).

Existing files can have their labels changed with selinux_restorecon(3).

The setfscreatecon(3) API is unfortunate as it involves global state. Recent enough versions of Linux have the O_TMPFILE flag for file creation, which doesn't create a directory entry for the file when it is created, so you can modify the file before it is visible to other processes, and can be bound into place with linkat(2).

/*
 * Look up the label the file contexts database assigns to path `tgt` for a
 * file of mode `srcmode`, and install it with setfscreatecon().
 *
 * Returns 0 on success, and also when selabel_open() fails with ENOENT.
 * Returns 1 when selabel_open() fails for any other reason, and otherwise
 * the non-zero result of selabel_lookup() or setfscreatecon().
 */
int set_selinux_create_context(const char *tgt, mode_t srcmode) {
    char *label = NULL;
    int status;
    struct selabel_handle *db = selabel_open(SELABEL_CTX_FILE, NULL, 0);

    if (db == NULL) {
        /* ENOENT from selabel_open is tolerated and reported as success. */
        return (errno != ENOENT) ? 1 : 0;
    }

    status = selabel_lookup(db, &label, tgt, srcmode);
    if (status == 0)
        status = setfscreatecon(label);

    freecon(label);
    selabel_close(db);
    return status;
}

SMACK

This is another security technology.

Like Selinux it has labels. These are stored in extended attributes matching security.SMACK64*, so require root privileges to copy faithfully.

btrfs flags

These are only worth copying if both source and destination are on btrfs, although if you later move the file back to btrfs you might want to restore them.

The only flag of real interest is "btrfs.compression", which is safe to ignore if moving to a file system which doesn't support it.

A "brain dead" implementation for this and SMACK is to check the prefix, and silently accept failure if setting the attribute fails.

/*
 * Copy extended attributes from srcfd to tgtfd.
 *
 * Attributes starting with "user." are copied and any failure is fatal.
 * Attributes starting with "security.SMACK64" or "btrfs." are copied on a
 * best-effort basis: if setting one fails with EINVAL it is skipped.
 * All other attributes are ignored.
 *
 * Returns 0 on success and a negative value on failure.
 */
static int copy_xattrs(int srcfd, int tgtfd) {
    ssize_t ret;
    char *names = NULL;
    void *value = NULL;
    size_t names_size = 0, value_size = 0;

    ret = xattr_list(srcfd, &names, &names_size);
    if (ret < 0)
        goto cleanup;

    /* The list is a run of NUL-terminated names packed back to back. */
    for (char *name = names; name < names + names_size;
         name = strchrnul(name, '\0') + 1) {
        int best_effort = str_starts_with(name, "security.SMACK64") ||
                          str_starts_with(name, "btrfs.");

        /* Skip xattrs that need special handling */
        if (!best_effort && !str_starts_with(name, "user.")) {
            continue;
        }

        ret = xattr_get(srcfd, name, &value, &value_size);
        if (ret < 0)
            goto cleanup;

        ret = TEMP_FAILURE_RETRY(fsetxattr(tgtfd, name, value, value_size, 0));
        if (ret < 0) {
            /*
             * NOTE(review): only EINVAL is tolerated here; other targets
             * may report a different errno for unsupported names - confirm.
             */
            if (errno == EINVAL && best_effort) {
                /*
                 * Forget the tolerated failure so it is not returned when
                 * this was the last attribute copied.
                 */
                ret = 0;
                continue;
            }
            goto cleanup;
        }
    }

cleanup:
    free(names);
    free(value);
    return ret;
}

As with previous articles, the full version of the my-mv.c source file and the Makefile may be downloaded.

The Makefile has changed from the earlier version because it now needs to link against libselinux.

So now we've got an equivalent to a slow rename(2), right?

Not quite: rename(2) is atomic. The file disappears from the old location and reappears whole at the new one at the same time.

Posted Wed Oct 19 11:00:06 2016 Tags:

I mentioned atomicity at the end of my previous article and had intended to write about how to make all the previous operations atomic, but doing so requires understanding how to atomically clobber files before we can atomically move them.

The naive approach would be to check whether the file existed before attempting the operation.

However this runs the risk of the file being added or removed between the check and the operation, in what's known as a time-of-check to time-of-use (TOCTOU) attack.

We previously mentioned how to atomically create a file without clobbering, but we may instead explicitly want to clobber it, or not care.

For opening a file, this is just a matter of using different flags.

/*
 * How an operation should treat a target path that may already exist.
 *
 * create_file() treats each TRY_ variant exactly like its strict
 * counterpart. rename_file() lets the TRY_ variants fall back to plain
 * rename(2) when renameat2(2) fails with ENOSYS or EINVAL, whereas
 * CLOBBER_REQUIRED and CLOBBER_FORBIDDEN never fall back.
 *
 * NOTE(review): the character values presumably mirror option letters of
 * the test program - confirm.
 */
enum clobber {
    CLOBBER_PERMITTED     = 'p', /* target may or may not exist */
    CLOBBER_REQUIRED      = 'R', /* target must already exist */
    CLOBBER_FORBIDDEN     = 'N', /* target must not already exist */
    CLOBBER_TRY_REQUIRED  = 'r', /* like REQUIRED, fallback allowed */
    CLOBBER_TRY_FORBIDDEN = 'n', /* like FORBIDDEN, fallback allowed */
};

/*
 * Open `path` with `flags` adjusted to honour the requested clobber policy.
 *
 * - CLOBBER_PERMITTED adds O_CREAT.
 * - CLOBBER_REQUIRED / CLOBBER_TRY_REQUIRED clear O_CREAT, so the file is
 *   never created by this call.
 * - CLOBBER_FORBIDDEN / CLOBBER_TRY_FORBIDDEN add O_CREAT|O_EXCL.
 *
 * `mode` is passed through to open(2). Returns whatever open(2) returns.
 */
int create_file(const char *path, mode_t mode, int flags,
                enum clobber clobber) {
    if (clobber == CLOBBER_PERMITTED) {
        flags |= O_CREAT;
    } else if (clobber == CLOBBER_REQUIRED ||
               clobber == CLOBBER_TRY_REQUIRED) {
        flags &= ~O_CREAT;
    } else if (clobber == CLOBBER_FORBIDDEN ||
               clobber == CLOBBER_TRY_FORBIDDEN) {
        flags |= O_CREAT|O_EXCL;
    } else {
        /* Not a known clobber policy. */
        assert(0);
    }

    return open(path, flags, mode);
}

For renaming a file things get a bit more awkward.

There are flags for changing how the rename behaves when the target file exists, but there isn't one for requiring that the target exists.

Instead there's RENAME_EXCHANGE which will fail if the target does not exist and the source file will replace the target on success, but it has the side effect of leaving the target file behind where the source file was.

This can be remedied by calling unlink(2).

int rename_file(const char *src, const char *tgt, enum clobber clobber) {
    int ret = -1;
    int renameflags = 0;

    switch (clobber) {
        case CLOBBER_REQUIRED:
        case CLOBBER_TRY_REQUIRED:
            renameflags = RENAME_EXCHANGE;
            break;
        case CLOBBER_FORBIDDEN:
        case CLOBBER_TRY_FORBIDDEN:
            renameflags = RENAME_NOREPLACE;
            break;
        default:
            assert(0);
    }

    ret = renameat2(AT_FDCWD, src, AT_FDCWD, tgt, renameflags);
    if (ret == 0) {
        if (clobber == CLOBBER_REQUIRED || clobber == CLOBBER_TRY_REQUIRED) {
            ret = unlink(src);
        }
        return ret;
    }

    if ((errno == ENOSYS || errno == EINVAL)
        && (clobber != CLOBBER_REQUIRED
            && clobber != CLOBBER_FORBIDDEN)) {
        ret = rename(src, tgt);
    }

cleanup:
    return ret;
}

A test program, clobbering.c and the accompanying Makefile may be downloaded.

This test program will rename if passed two file paths and copy standard input to a file if only passed one path.

Posted Wed Oct 26 11:00:12 2016 Tags: