// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/syserror" ) // Credentials returns t's credentials. // // This value must be considered immutable. func (t *Task) Credentials() *auth.Credentials { return t.creds.Load() } // UserNamespace returns the user namespace associated with the task. func (t *Task) UserNamespace() *auth.UserNamespace { return t.Credentials().UserNamespace } // HasCapabilityIn checks if the task has capability cp in user namespace ns. func (t *Task) HasCapabilityIn(cp linux.Capability, ns *auth.UserNamespace) bool { return t.Credentials().HasCapabilityIn(cp, ns) } // HasCapability checks if the task has capability cp in its user namespace. func (t *Task) HasCapability(cp linux.Capability) bool { return t.Credentials().HasCapability(cp) } // SetUID implements the semantics of setuid(2). func (t *Task) SetUID(uid auth.UID) error { // setuid considers -1 to be invalid. if !uid.Ok() { return syserror.EINVAL } t.mu.Lock() defer t.mu.Unlock() creds := t.Credentials() kuid := creds.UserNamespace.MapToKUID(uid) if !kuid.Ok() { return syserror.EINVAL } // "setuid() sets the effective user ID of the calling process. If the // effective UID of the caller is root (more precisely: if the caller has // the CAP_SETUID capability), the real UID and saved set-user-ID are also // set." - setuid(2) if creds.HasCapability(linux.CAP_SETUID) { t.setKUIDsUncheckedLocked(kuid, kuid, kuid) return nil } // "EPERM: The user is not privileged (Linux: does not have the CAP_SETUID // capability) and uid does not match the real UID or saved set-user-ID of // the calling process." if kuid != creds.RealKUID && kuid != creds.SavedKUID { return syserror.EPERM } t.setKUIDsUncheckedLocked(creds.RealKUID, kuid, creds.SavedKUID) return nil } // SetREUID implements the semantics of setreuid(2). func (t *Task) SetREUID(r, e auth.UID) error { t.mu.Lock() defer t.mu.Unlock() // "Supplying a value of -1 for either the real or effective user ID forces // the system to leave that ID unchanged." - setreuid(2) creds := t.Credentials() newR := creds.RealKUID if r.Ok() { newR = creds.UserNamespace.MapToKUID(r) if !newR.Ok() { return syserror.EINVAL } } newE := creds.EffectiveKUID if e.Ok() { newE = creds.UserNamespace.MapToKUID(e) if !newE.Ok() { return syserror.EINVAL } } if !creds.HasCapability(linux.CAP_SETUID) { // "Unprivileged processes may only set the effective user ID to the // real user ID, the effective user ID, or the saved set-user-ID." if newE != creds.RealKUID && newE != creds.EffectiveKUID && newE != creds.SavedKUID { return syserror.EPERM } // "Unprivileged users may only set the real user ID to the real user // ID or the effective user ID." if newR != creds.RealKUID && newR != creds.EffectiveKUID { return syserror.EPERM } } // "If the real user ID is set (i.e., ruid is not -1) or the effective user // ID is set to a value not equal to the previous real user ID, the saved // set-user-ID will be set to the new effective user ID." newS := creds.SavedKUID if r.Ok() || (e.Ok() && newE != creds.EffectiveKUID) { newS = newE } t.setKUIDsUncheckedLocked(newR, newE, newS) return nil } // SetRESUID implements the semantics of the setresuid(2) syscall. func (t *Task) SetRESUID(r, e, s auth.UID) error { t.mu.Lock() defer t.mu.Unlock() // "Unprivileged user processes may change the real UID, effective UID, and // saved set-user-ID, each to one of: the current real UID, the current // effective UID or the current saved set-user-ID. Privileged processes (on // Linux, those having the CAP_SETUID capability) may set the real UID, // effective UID, and saved set-user-ID to arbitrary values. If one of the // arguments equals -1, the corresponding value is not changed." - // setresuid(2) var err error creds := t.Credentials() newR := creds.RealKUID if r.Ok() { newR, err = creds.UseUID(r) if err != nil { return err } } newE := creds.EffectiveKUID if e.Ok() { newE, err = creds.UseUID(e) if err != nil { return err } } newS := creds.SavedKUID if s.Ok() { newS, err = creds.UseUID(s) if err != nil { return err } } t.setKUIDsUncheckedLocked(newR, newE, newS) return nil } // Preconditions: t.mu must be locked. func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) { creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds. root := creds.UserNamespace.MapToKUID(auth.RootUID) oldR, oldE, oldS := creds.RealKUID, creds.EffectiveKUID, creds.SavedKUID creds.RealKUID, creds.EffectiveKUID, creds.SavedKUID = newR, newE, newS // "1. If one or more of the real, effective or saved set user IDs was // previously 0, and as a result of the UID changes all of these IDs have a // nonzero value, then all capabilities are cleared from the permitted and // effective capability sets." - capabilities(7) if (oldR == root || oldE == root || oldS == root) && (newR != root && newE != root && newS != root) { // prctl(2): "PR_SET_KEEPCAP: Set the state of the calling thread's // "keep capabilities" flag, which determines whether the thread's permitted // capability set is cleared when a change is made to the // thread's user IDs such that the thread's real UID, effective // UID, and saved set-user-ID all become nonzero when at least // one of them previously had the value 0. By default, the // permitted capability set is cleared when such a change is // made; setting the "keep capabilities" flag prevents it from // being cleared." (A thread's effective capability set is always // cleared when such a credential change is made, // regardless of the setting of the "keep capabilities" flag.) if !creds.KeepCaps { creds.PermittedCaps = 0 creds.EffectiveCaps = 0 } } // """ // 2. If the effective user ID is changed from 0 to nonzero, then all // capabilities are cleared from the effective set. // // 3. If the effective user ID is changed from nonzero to 0, then the // permitted set is copied to the effective set. // """ if oldE == root && newE != root { creds.EffectiveCaps = 0 } else if oldE != root && newE == root { creds.EffectiveCaps = creds.PermittedCaps } // "4. If the filesystem user ID is changed from 0 to nonzero (see // setfsuid(2)), then the following capabilities are cleared from the // effective set: ..." // (filesystem UIDs aren't implemented, nor are any of the capabilities in // question) if oldE != newE { // "[dumpability] is reset to the current value contained in // the file /proc/sys/fs/suid_dumpable (which by default has // the value 0), in the following circumstances: The process's // effective user or group ID is changed." - prctl(2) // // (suid_dumpable isn't implemented, so we just use the // default. t.MemoryManager().SetDumpability(mm.NotDumpable) // Not documented, but compare Linux's kernel/cred.c:commit_creds(). t.parentDeathSignal = 0 } t.creds.Store(creds) } // SetGID implements the semantics of setgid(2). func (t *Task) SetGID(gid auth.GID) error { if !gid.Ok() { return syserror.EINVAL } t.mu.Lock() defer t.mu.Unlock() creds := t.Credentials() kgid := creds.UserNamespace.MapToKGID(gid) if !kgid.Ok() { return syserror.EINVAL } if creds.HasCapability(linux.CAP_SETGID) { t.setKGIDsUncheckedLocked(kgid, kgid, kgid) return nil } if kgid != creds.RealKGID && kgid != creds.SavedKGID { return syserror.EPERM } t.setKGIDsUncheckedLocked(creds.RealKGID, kgid, creds.SavedKGID) return nil } // SetREGID implements the semantics of setregid(2). func (t *Task) SetREGID(r, e auth.GID) error { t.mu.Lock() defer t.mu.Unlock() creds := t.Credentials() newR := creds.RealKGID if r.Ok() { newR = creds.UserNamespace.MapToKGID(r) if !newR.Ok() { return syserror.EINVAL } } newE := creds.EffectiveKGID if e.Ok() { newE = creds.UserNamespace.MapToKGID(e) if !newE.Ok() { return syserror.EINVAL } } if !creds.HasCapability(linux.CAP_SETGID) { if newE != creds.RealKGID && newE != creds.EffectiveKGID && newE != creds.SavedKGID { return syserror.EPERM } if newR != creds.RealKGID && newR != creds.EffectiveKGID { return syserror.EPERM } } newS := creds.SavedKGID if r.Ok() || (e.Ok() && newE != creds.EffectiveKGID) { newS = newE } t.setKGIDsUncheckedLocked(newR, newE, newS) return nil } // SetRESGID implements the semantics of the setresgid(2) syscall. func (t *Task) SetRESGID(r, e, s auth.GID) error { var err error t.mu.Lock() defer t.mu.Unlock() creds := t.Credentials() newR := creds.RealKGID if r.Ok() { newR, err = creds.UseGID(r) if err != nil { return err } } newE := creds.EffectiveKGID if e.Ok() { newE, err = creds.UseGID(e) if err != nil { return err } } newS := creds.SavedKGID if s.Ok() { newS, err = creds.UseGID(s) if err != nil { return err } } t.setKGIDsUncheckedLocked(newR, newE, newS) return nil } func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) { creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds. oldE := creds.EffectiveKGID creds.RealKGID, creds.EffectiveKGID, creds.SavedKGID = newR, newE, newS if oldE != newE { // "[dumpability] is reset to the current value contained in // the file /proc/sys/fs/suid_dumpable (which by default has // the value 0), in the following circumstances: The process's // effective user or group ID is changed." - prctl(2) // // (suid_dumpable isn't implemented, so we just use the // default. t.MemoryManager().SetDumpability(mm.NotDumpable) // Not documented, but compare Linux's // kernel/cred.c:commit_creds(). t.parentDeathSignal = 0 } t.creds.Store(creds) } // SetExtraGIDs attempts to change t's supplemental groups. All IDs are // interpreted as being in t's user namespace. func (t *Task) SetExtraGIDs(gids []auth.GID) error { t.mu.Lock() defer t.mu.Unlock() creds := t.Credentials() if !creds.HasCapability(linux.CAP_SETGID) { return syserror.EPERM } kgids := make([]auth.KGID, len(gids)) for i, gid := range gids { kgid := creds.UserNamespace.MapToKGID(gid) if !kgid.Ok() { return syserror.EINVAL } kgids[i] = kgid } creds = creds.Fork() // The credentials object is immutable. See doc for creds. creds.ExtraKGIDs = kgids t.creds.Store(creds) return nil } // SetCapabilitySets attempts to change t's permitted, inheritable, and // effective capability sets. func (t *Task) SetCapabilitySets(permitted, inheritable, effective auth.CapabilitySet) error { t.mu.Lock() defer t.mu.Unlock() // "Permitted: This is a limiting superset for the effective capabilities // that the thread may assume." - capabilities(7) if effective & ^permitted != 0 { return syserror.EPERM } creds := t.Credentials() // "It is also a limiting superset for the capabilities that may be added // to the inheritable set by a thread that does not have the CAP_SETPCAP // capability in its effective set." if !creds.HasCapability(linux.CAP_SETPCAP) && (inheritable & ^(creds.InheritableCaps|creds.PermittedCaps) != 0) { return syserror.EPERM } // "If a thread drops a capability from its permitted set, it can never // reacquire that capability (unless it execve(2)s ..." if permitted & ^creds.PermittedCaps != 0 { return syserror.EPERM } // "... if a capability is not in the bounding set, then a thread can't add // this capability to its inheritable set, even if it was in its permitted // capabilities ..." if inheritable & ^(creds.InheritableCaps|creds.BoundingCaps) != 0 { return syserror.EPERM } creds = creds.Fork() // The credentials object is immutable. See doc for creds. creds.PermittedCaps = permitted creds.InheritableCaps = inheritable creds.EffectiveCaps = effective t.creds.Store(creds) return nil } // DropBoundingCapability attempts to drop capability cp from t's capability // bounding set. func (t *Task) DropBoundingCapability(cp linux.Capability) error { t.mu.Lock() defer t.mu.Unlock() creds := t.Credentials() if !creds.HasCapability(linux.CAP_SETPCAP) { return syserror.EPERM } creds = creds.Fork() // The credentials object is immutable. See doc for creds. creds.BoundingCaps &^= auth.CapabilitySetOf(cp) t.creds.Store(creds) return nil } // SetUserNamespace attempts to move c into ns. func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error { t.mu.Lock() defer t.mu.Unlock() creds := t.Credentials() // "A process reassociating itself with a user namespace must have the // CAP_SYS_ADMIN capability in the target user namespace." - setns(2) // // If t just created ns, then t.creds is guaranteed to have CAP_SYS_ADMIN // in ns (by rule 3 in auth.Credentials.HasCapability). if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns) { return syserror.EPERM } creds = creds.Fork() // The credentials object is immutable. See doc for creds. creds.UserNamespace = ns // "The child process created by clone(2) with the CLONE_NEWUSER flag // starts out with a complete set of capabilities in the new user // namespace. Likewise, a process that creates a new user namespace using // unshare(2) or joins an existing user namespace using setns(2) gains a // full set of capabilities in that namespace." creds.PermittedCaps = auth.AllCapabilities creds.InheritableCaps = 0 creds.EffectiveCaps = auth.AllCapabilities creds.BoundingCaps = auth.AllCapabilities // "A call to clone(2), unshare(2), or setns(2) using the CLONE_NEWUSER // flag sets the "securebits" flags (see capabilities(7)) to their default // values (all flags disabled) in the child (for clone(2)) or caller (for // unshare(2), or setns(2)." - user_namespaces(7) creds.KeepCaps = false t.creds.Store(creds) return nil } // SetKeepCaps will set the keep capabilities flag PR_SET_KEEPCAPS. func (t *Task) SetKeepCaps(k bool) { t.mu.Lock() defer t.mu.Unlock() creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds. creds.KeepCaps = k t.creds.Store(creds) } // updateCredsForExecLocked updates t.creds to reflect an execve(). // // NOTE(b/30815691): We currently do not implement privileged executables // (set-user/group-ID bits and file capabilities). This allows us to make a lot // of simplifying assumptions: // // - We assume the no_new_privs bit (set by prctl(SET_NO_NEW_PRIVS)), which // disables the features we don't support anyway, is always set. This // drastically simplifies this function. // // - We don't set AT_SECURE = 1, because no_new_privs always being set means // that the conditions that require AT_SECURE = 1 never arise. (Compare Linux's // security/commoncap.c:cap_bprm_set_creds() and cap_bprm_secureexec().) // // - We don't check for CAP_SYS_ADMIN in prctl(PR_SET_SECCOMP), since // seccomp-bpf is also allowed if the task has no_new_privs set. // // - Task.ptraceAttach does not serialize with execve as it does in Linux, // since no_new_privs being set has the same effect as the presence of an // unprivileged tracer. // // Preconditions: t.mu must be locked. func (t *Task) updateCredsForExecLocked() { // """ // During an execve(2), the kernel calculates the new capabilities of // the process using the following algorithm: // // P'(permitted) = (P(inheritable) & F(inheritable)) | // (F(permitted) & cap_bset) // // P'(effective) = F(effective) ? P'(permitted) : 0 // // P'(inheritable) = P(inheritable) [i.e., unchanged] // // where: // // P denotes the value of a thread capability set before the // execve(2) // // P' denotes the value of a thread capability set after the // execve(2) // // F denotes a file capability set // // cap_bset is the value of the capability bounding set // // ... // // In order to provide an all-powerful root using capability sets, during // an execve(2): // // 1. If a set-user-ID-root program is being executed, or the real user ID // of the process is 0 (root) then the file inheritable and permitted sets // are defined to be all ones (i.e. all capabilities enabled). // // 2. If a set-user-ID-root program is being executed, then the file // effective bit is defined to be one (enabled). // // The upshot of the above rules, combined with the capabilities // transformations described above, is that when a process execve(2)s a // set-user-ID-root program, or when a process with an effective UID of 0 // execve(2)s a program, it gains all capabilities in its permitted and // effective capability sets, except those masked out by the capability // bounding set. // """ - capabilities(7) // (ambient capability sets omitted) // // As the last paragraph implies, the case of "a set-user-ID root program // is being executed" also includes the case where (namespace) root is // executing a non-set-user-ID program; the actual check is just based on // the effective user ID. var newPermitted auth.CapabilitySet // since F(inheritable) == F(permitted) == 0 fileEffective := false creds := t.Credentials() root := creds.UserNamespace.MapToKUID(auth.RootUID) if creds.EffectiveKUID == root || creds.RealKUID == root { newPermitted = creds.InheritableCaps | creds.BoundingCaps if creds.EffectiveKUID == root { fileEffective = true } } creds = creds.Fork() // The credentials object is immutable. See doc for creds. // Now we enter poorly-documented, somewhat confusing territory. (The // accompanying comment in Linux's security/commoncap.c:cap_bprm_set_creds // is not very helpful.) My reading of it is: // // If at least one of the following is true: // // A1. The execing task is ptraced, and the tracer did not have // CAP_SYS_PTRACE in the execing task's user namespace at the time of // PTRACE_ATTACH. // // A2. The execing task shares its FS context with at least one task in // another thread group. // // A3. The execing task has no_new_privs set. // // AND at least one of the following is true: // // B1. The new effective user ID (which may come from set-user-ID, or be the // execing task's existing effective user ID) is not equal to the task's // real UID. // // B2. The new effective group ID (which may come from set-group-ID, or be // the execing task's existing effective group ID) is not equal to the // task's real GID. // // B3. The new permitted capability set contains capabilities not in the // task's permitted capability set. // // Then: // // C1. Limit the new permitted capability set to the task's permitted // capability set. // // C2. If either the task does not have CAP_SETUID in its user namespace, or // the task has no_new_privs set, force the new effective UID and GID to // the task's real UID and GID. // // But since no_new_privs is always set (A3 is always true), this becomes // much simpler. If B1 and B2 are false, C2 is a no-op. If B3 is false, C1 // is a no-op. So we can just do C1 and C2 unconditionally. if creds.EffectiveKUID != creds.RealKUID || creds.EffectiveKGID != creds.RealKGID { creds.EffectiveKUID = creds.RealKUID creds.EffectiveKGID = creds.RealKGID t.parentDeathSignal = 0 } // (Saved set-user-ID is always set to the new effective user ID, and saved // set-group-ID is always set to the new effective group ID, regardless of // the above.) creds.SavedKUID = creds.RealKUID creds.SavedKGID = creds.RealKGID creds.PermittedCaps &= newPermitted if fileEffective { creds.EffectiveCaps = creds.PermittedCaps } else { creds.EffectiveCaps = 0 } // prctl(2): The "keep capabilities" value will be reset to 0 on subsequent // calls to execve(2). creds.KeepCaps = false // "The bounding set is inherited at fork(2) from the thread's parent, and // is preserved across an execve(2)". So we're done. t.creds.Store(creds) }