xref: /unit/src/nxt_isolation.c (revision 1673:883f2f79c2f6)
1 /*
2  * Copyright (C) NGINX, Inc.
3  */
4 
5 #include <nxt_main.h>
6 #include <nxt_application.h>
7 #include <nxt_process.h>
8 #include <nxt_isolation.h>
9 
10 #if (NXT_HAVE_PIVOT_ROOT)
11 #include <mntent.h>
12 #endif
13 
14 
15 static nxt_int_t nxt_isolation_set(nxt_task_t *task,
16     nxt_conf_value_t *isolation, nxt_process_t *process);
17 
18 #if (NXT_HAVE_CLONE)
19 static nxt_int_t nxt_isolation_set_namespaces(nxt_task_t *task,
20     nxt_conf_value_t *isolation, nxt_process_t *process);
21 static nxt_int_t nxt_isolation_clone_flags(nxt_task_t *task,
22     nxt_conf_value_t *namespaces, nxt_clone_t *clone);
23 #endif
24 
25 #if (NXT_HAVE_CLONE_NEWUSER)
26 static nxt_int_t nxt_isolation_set_creds(nxt_task_t *task,
27     nxt_conf_value_t *isolation, nxt_process_t *process);
28 static nxt_int_t nxt_isolation_credential_map(nxt_task_t *task,
29     nxt_mp_t *mem_pool, nxt_conf_value_t *map_array,
30     nxt_clone_credential_map_t *map);
31 static nxt_int_t nxt_isolation_vldt_creds(nxt_task_t *task,
32     nxt_process_t *process);
33 #endif
34 
35 #if (NXT_HAVE_ISOLATION_ROOTFS)
36 static nxt_int_t nxt_isolation_set_rootfs(nxt_task_t *task,
37     nxt_conf_value_t *isolation, nxt_process_t *process);
38 static nxt_int_t nxt_isolation_set_automount(nxt_task_t *task,
39     nxt_conf_value_t *isolation, nxt_process_t *process);
40 static nxt_int_t nxt_isolation_set_mounts(nxt_task_t *task,
41     nxt_process_t *process, nxt_str_t *app_type);
42 static nxt_int_t nxt_isolation_set_lang_mounts(nxt_task_t *task,
43     nxt_process_t *process, nxt_array_t *syspaths);
44 static int nxt_cdecl nxt_isolation_mount_compare(const void *v1,
45     const void *v2);
46 static void nxt_isolation_unmount_all(nxt_task_t *task, nxt_process_t *process);
47 
48 #if (NXT_HAVE_PIVOT_ROOT) && (NXT_HAVE_CLONE_NEWNS)
49 static nxt_int_t nxt_isolation_pivot_root(nxt_task_t *task, const char *rootfs);
50 static nxt_int_t nxt_isolation_make_private_mount(nxt_task_t *task,
51     const char *rootfs);
52 nxt_inline int nxt_pivot_root(const char *new_root, const char *old_root);
53 #endif
54 
55 static nxt_int_t nxt_isolation_chroot(nxt_task_t *task, const char *path);
56 #endif
57 
58 #if (NXT_HAVE_PR_SET_NO_NEW_PRIVS)
59 static nxt_int_t nxt_isolation_set_new_privs(nxt_task_t *task,
60     nxt_conf_value_t *isolation, nxt_process_t *process);
61 #endif
62 
63 
64 nxt_int_t
65 nxt_isolation_main_prefork(nxt_task_t *task, nxt_process_t *process,
66     nxt_mp_t *mp)
67 {
68     nxt_int_t              cap_setid;
69     nxt_int_t              ret;
70     nxt_runtime_t          *rt;
71     nxt_common_app_conf_t  *app_conf;
72 
73     rt = task->thread->runtime;
74     app_conf = process->data.app;
75     cap_setid = rt->capabilities.setid;
76 
77     if (app_conf->isolation != NULL) {
78         ret = nxt_isolation_set(task, app_conf->isolation, process);
79         if (nxt_slow_path(ret != NXT_OK)) {
80             return ret;
81         }
82     }
83 
84 #if (NXT_HAVE_CLONE_NEWUSER)
85     if (nxt_is_clone_flag_set(process->isolation.clone.flags, NEWUSER)) {
86         cap_setid = 1;
87     }
88 #endif
89 
90     if (cap_setid) {
91         ret = nxt_process_creds_set(task, process, &app_conf->user,
92                                     &app_conf->group);
93 
94         if (nxt_slow_path(ret != NXT_OK)) {
95             return ret;
96         }
97 
98     } else {
99         if (!nxt_str_eq(&app_conf->user, (u_char *) rt->user_cred.user,
100                         nxt_strlen(rt->user_cred.user)))
101         {
102             nxt_alert(task, "cannot set user \"%V\" for app \"%V\": "
103                       "missing capabilities", &app_conf->user, &app_conf->name);
104 
105             return NXT_ERROR;
106         }
107 
108         if (app_conf->group.length > 0
109             && !nxt_str_eq(&app_conf->group, (u_char *) rt->group,
110                            nxt_strlen(rt->group)))
111         {
112             nxt_alert(task, "cannot set group \"%V\" for app \"%V\": "
113                             "missing capabilities", &app_conf->group,
114                             &app_conf->name);
115 
116             return NXT_ERROR;
117         }
118     }
119 
120 #if (NXT_HAVE_ISOLATION_ROOTFS)
121     if (process->isolation.rootfs != NULL) {
122         nxt_int_t  has_mnt;
123 
124         ret = nxt_isolation_set_mounts(task, process, &app_conf->type);
125         if (nxt_slow_path(ret != NXT_OK)) {
126             return ret;
127         }
128 
129         has_mnt = 0;
130 
131 #if (NXT_HAVE_CLONE_NEWNS)
132         has_mnt = nxt_is_clone_flag_set(process->isolation.clone.flags, NEWNS);
133 #endif
134 
135         if (process->user_cred->uid == 0 && !has_mnt) {
136             nxt_log(task, NXT_LOG_WARN,
137                     "setting user \"root\" with \"rootfs\" is unsafe without "
138                     "\"mount\" namespace isolation");
139         }
140     }
141 #endif
142 
143 #if (NXT_HAVE_CLONE_NEWUSER)
144     ret = nxt_isolation_vldt_creds(task, process);
145     if (nxt_slow_path(ret != NXT_OK)) {
146         return ret;
147     }
148 #endif
149 
150     return NXT_OK;
151 }
152 
153 
154 static nxt_int_t
155 nxt_isolation_set(nxt_task_t *task, nxt_conf_value_t *isolation,
156     nxt_process_t *process)
157 {
158 #if (NXT_HAVE_CLONE)
159     if (nxt_slow_path(nxt_isolation_set_namespaces(task, isolation, process)
160                       != NXT_OK))
161     {
162         return NXT_ERROR;
163     }
164 #endif
165 
166 #if (NXT_HAVE_CLONE_NEWUSER)
167     if (nxt_slow_path(nxt_isolation_set_creds(task, isolation, process)
168                       != NXT_OK))
169     {
170         return NXT_ERROR;
171     }
172 #endif
173 
174 #if (NXT_HAVE_ISOLATION_ROOTFS)
175     if (nxt_slow_path(nxt_isolation_set_rootfs(task, isolation, process)
176                       != NXT_OK))
177     {
178         return NXT_ERROR;
179     }
180 
181     if (nxt_slow_path(nxt_isolation_set_automount(task, isolation, process)
182                       != NXT_OK))
183     {
184         return NXT_ERROR;
185     }
186 #endif
187 
188 #if (NXT_HAVE_PR_SET_NO_NEW_PRIVS)
189     if (nxt_slow_path(nxt_isolation_set_new_privs(task, isolation, process)
190                       != NXT_OK))
191     {
192         return NXT_ERROR;
193     }
194 #endif
195 
196     return NXT_OK;
197 }
198 
199 
200 #if (NXT_HAVE_CLONE)
201 
202 static nxt_int_t
203 nxt_isolation_set_namespaces(nxt_task_t *task, nxt_conf_value_t *isolation,
204     nxt_process_t *process)
205 {
206     nxt_int_t         ret;
207     nxt_conf_value_t  *obj;
208 
209     static nxt_str_t  nsname = nxt_string("namespaces");
210 
211     obj = nxt_conf_get_object_member(isolation, &nsname, NULL);
212     if (obj != NULL) {
213         ret = nxt_isolation_clone_flags(task, obj, &process->isolation.clone);
214         if (nxt_slow_path(ret != NXT_OK)) {
215             return NXT_ERROR;
216         }
217     }
218 
219     return NXT_OK;
220 }
221 
222 #endif
223 
224 
225 #if (NXT_HAVE_CLONE_NEWUSER)
226 
227 static nxt_int_t
228 nxt_isolation_set_creds(nxt_task_t *task, nxt_conf_value_t *isolation,
229     nxt_process_t *process)
230 {
231     nxt_int_t         ret;
232     nxt_clone_t       *clone;
233     nxt_conf_value_t  *array;
234 
235     static nxt_str_t uidname = nxt_string("uidmap");
236     static nxt_str_t gidname = nxt_string("gidmap");
237 
238     clone = &process->isolation.clone;
239 
240     array = nxt_conf_get_object_member(isolation, &uidname, NULL);
241     if (array != NULL) {
242         ret = nxt_isolation_credential_map(task, process->mem_pool, array,
243                                            &clone->uidmap);
244 
245         if (nxt_slow_path(ret != NXT_OK)) {
246             return NXT_ERROR;
247         }
248     }
249 
250     array = nxt_conf_get_object_member(isolation, &gidname, NULL);
251     if (array != NULL) {
252         ret = nxt_isolation_credential_map(task, process->mem_pool, array,
253                                            &clone->gidmap);
254 
255         if (nxt_slow_path(ret != NXT_OK)) {
256             return NXT_ERROR;
257         }
258     }
259 
260     return NXT_OK;
261 }
262 
263 
264 static nxt_int_t
265 nxt_isolation_credential_map(nxt_task_t *task, nxt_mp_t *mp,
266     nxt_conf_value_t *map_array, nxt_clone_credential_map_t *map)
267 {
268     nxt_int_t         ret;
269     nxt_uint_t        i;
270     nxt_conf_value_t  *obj;
271 
272     static nxt_conf_map_t  nxt_clone_map_entry_conf[] = {
273         {
274             nxt_string("container"),
275             NXT_CONF_MAP_INT,
276             offsetof(nxt_clone_map_entry_t, container),
277         },
278 
279         {
280             nxt_string("host"),
281             NXT_CONF_MAP_INT,
282             offsetof(nxt_clone_map_entry_t, host),
283         },
284 
285         {
286             nxt_string("size"),
287             NXT_CONF_MAP_INT,
288             offsetof(nxt_clone_map_entry_t, size),
289         },
290     };
291 
292     map->size = nxt_conf_array_elements_count(map_array);
293 
294     if (map->size == 0) {
295         return NXT_OK;
296     }
297 
298     map->map = nxt_mp_alloc(mp, map->size * sizeof(nxt_clone_map_entry_t));
299     if (nxt_slow_path(map->map == NULL)) {
300         return NXT_ERROR;
301     }
302 
303     for (i = 0; i < map->size; i++) {
304         obj = nxt_conf_get_array_element(map_array, i);
305 
306         ret = nxt_conf_map_object(mp, obj, nxt_clone_map_entry_conf,
307                                   nxt_nitems(nxt_clone_map_entry_conf),
308                                   map->map + i);
309         if (nxt_slow_path(ret != NXT_OK)) {
310             nxt_alert(task, "clone map entry map error");
311             return NXT_ERROR;
312         }
313     }
314 
315     return NXT_OK;
316 }
317 
318 
319 static nxt_int_t
320 nxt_isolation_vldt_creds(nxt_task_t *task, nxt_process_t *process)
321 {
322     nxt_int_t         ret;
323     nxt_clone_t       *clone;
324     nxt_credential_t  *creds;
325 
326     clone = &process->isolation.clone;
327     creds = process->user_cred;
328 
329     if (clone->uidmap.size == 0 && clone->gidmap.size == 0) {
330         return NXT_OK;
331     }
332 
333     if (!nxt_is_clone_flag_set(clone->flags, NEWUSER)) {
334         if (nxt_slow_path(clone->uidmap.size > 0)) {
335             nxt_log(task, NXT_LOG_ERR, "\"uidmap\" is set but "
336                     "\"isolation.namespaces.credential\" is false or unset");
337 
338             return NXT_ERROR;
339         }
340 
341         if (nxt_slow_path(clone->gidmap.size > 0)) {
342             nxt_log(task, NXT_LOG_ERR, "\"gidmap\" is set but "
343                     "\"isolation.namespaces.credential\" is false or unset");
344 
345             return NXT_ERROR;
346         }
347 
348         return NXT_OK;
349     }
350 
351     ret = nxt_clone_vldt_credential_uidmap(task, &clone->uidmap, creds);
352     if (nxt_slow_path(ret != NXT_OK)) {
353         return NXT_ERROR;
354     }
355 
356     return nxt_clone_vldt_credential_gidmap(task, &clone->gidmap, creds);
357 }
358 
359 #endif
360 
361 
362 #if (NXT_HAVE_CLONE)
363 
364 static nxt_int_t
365 nxt_isolation_clone_flags(nxt_task_t *task, nxt_conf_value_t *namespaces,
366     nxt_clone_t *clone)
367 {
368     uint32_t          index;
369     nxt_str_t         name;
370     nxt_int_t         flag;
371     nxt_conf_value_t  *value;
372 
373     index = 0;
374 
375     for ( ;; ) {
376         value = nxt_conf_next_object_member(namespaces, &name, &index);
377 
378         if (value == NULL) {
379             break;
380         }
381 
382         flag = 0;
383 
384 #if (NXT_HAVE_CLONE_NEWUSER)
385         if (nxt_str_eq(&name, "credential", 10)) {
386             flag = CLONE_NEWUSER;
387         }
388 #endif
389 
390 #if (NXT_HAVE_CLONE_NEWPID)
391         if (nxt_str_eq(&name, "pid", 3)) {
392             flag = CLONE_NEWPID;
393         }
394 #endif
395 
396 #if (NXT_HAVE_CLONE_NEWNET)
397         if (nxt_str_eq(&name, "network", 7)) {
398             flag = CLONE_NEWNET;
399         }
400 #endif
401 
402 #if (NXT_HAVE_CLONE_NEWUTS)
403         if (nxt_str_eq(&name, "uname", 5)) {
404             flag = CLONE_NEWUTS;
405         }
406 #endif
407 
408 #if (NXT_HAVE_CLONE_NEWNS)
409         if (nxt_str_eq(&name, "mount", 5)) {
410             flag = CLONE_NEWNS;
411         }
412 #endif
413 
414 #if (NXT_HAVE_CLONE_NEWCGROUP)
415         if (nxt_str_eq(&name, "cgroup", 6)) {
416             flag = CLONE_NEWCGROUP;
417         }
418 #endif
419 
420         if (!flag) {
421             nxt_alert(task, "unknown namespace flag: \"%V\"", &name);
422             return NXT_ERROR;
423         }
424 
425         if (nxt_conf_get_boolean(value)) {
426             clone->flags |= flag;
427         }
428     }
429 
430     return NXT_OK;
431 }
432 
433 #endif
434 
435 
436 #if (NXT_HAVE_ISOLATION_ROOTFS)
437 
438 static nxt_int_t
439 nxt_isolation_set_rootfs(nxt_task_t *task, nxt_conf_value_t *isolation,
440     nxt_process_t *process)
441 {
442     nxt_str_t         str;
443     nxt_conf_value_t  *obj;
444 
445     static nxt_str_t  rootfs_name = nxt_string("rootfs");
446 
447     obj = nxt_conf_get_object_member(isolation, &rootfs_name, NULL);
448     if (obj != NULL) {
449         nxt_conf_get_string(obj, &str);
450 
451         if (nxt_slow_path(str.length <= 1 || str.start[0] != '/')) {
452             nxt_log(task, NXT_LOG_ERR, "rootfs requires an absolute path other "
453                     "than \"/\" but given \"%V\"", &str);
454 
455             return NXT_ERROR;
456         }
457 
458         if (str.start[str.length - 1] == '/') {
459             str.length--;
460         }
461 
462         process->isolation.rootfs = nxt_mp_alloc(process->mem_pool,
463                                                  str.length + 1);
464 
465         if (nxt_slow_path(process->isolation.rootfs == NULL)) {
466             return NXT_ERROR;
467         }
468 
469         nxt_memcpy(process->isolation.rootfs, str.start, str.length);
470 
471         process->isolation.rootfs[str.length] = '\0';
472     }
473 
474     return NXT_OK;
475 }
476 
477 
478 static nxt_int_t
479 nxt_isolation_set_automount(nxt_task_t *task, nxt_conf_value_t *isolation,
480     nxt_process_t *process)
481 {
482     nxt_conf_value_t         *conf, *value;
483     nxt_process_automount_t  *automount;
484 
485     static nxt_str_t  automount_name = nxt_string("automount");
486     static nxt_str_t  langdeps_name = nxt_string("language_deps");
487 
488     automount = &process->isolation.automount;
489 
490     automount->language_deps = 1;
491 
492     conf = nxt_conf_get_object_member(isolation, &automount_name, NULL);
493     if (conf != NULL) {
494         value = nxt_conf_get_object_member(conf, &langdeps_name, NULL);
495         if (value != NULL) {
496             automount->language_deps = nxt_conf_get_boolean(value);
497         }
498     }
499 
500     return NXT_OK;
501 }
502 
503 
504 static nxt_int_t
505 nxt_isolation_set_mounts(nxt_task_t *task, nxt_process_t *process,
506     nxt_str_t *app_type)
507 {
508     nxt_int_t              ret, cap_chroot;
509     nxt_runtime_t          *rt;
510     nxt_app_lang_module_t  *lang;
511 
512     rt = task->thread->runtime;
513     cap_chroot = rt->capabilities.chroot;
514     lang = nxt_app_lang_module(rt, app_type);
515 
516     nxt_assert(lang != NULL);
517 
518 #if (NXT_HAVE_CLONE_NEWUSER)
519     if (nxt_is_clone_flag_set(process->isolation.clone.flags, NEWUSER)) {
520         cap_chroot = 1;
521     }
522 #endif
523 
524     if (!cap_chroot) {
525         nxt_log(task, NXT_LOG_ERR, "The \"rootfs\" field requires privileges");
526         return NXT_ERROR;
527     }
528 
529     ret = nxt_isolation_set_lang_mounts(task, process, lang->mounts);
530     if (nxt_slow_path(ret != NXT_OK)) {
531         return NXT_ERROR;
532     }
533 
534     process->isolation.cleanup = nxt_isolation_unmount_all;
535 
536     return NXT_OK;
537 }
538 
539 
540 static nxt_int_t
541 nxt_isolation_set_lang_mounts(nxt_task_t *task, nxt_process_t *process,
542     nxt_array_t *lang_mounts)
543 {
544     u_char          *p;
545     size_t          i, n, rootfs_len, len;
546     nxt_mp_t        *mp;
547     nxt_array_t     *mounts;
548     const u_char    *rootfs;
549     nxt_fs_mount_t  *mnt, *lang_mnt;
550 
551     mp = process->mem_pool;
552 
553     /* copy to init mem pool */
554     mounts = nxt_array_copy(mp, NULL, lang_mounts);
555     if (mounts == NULL) {
556         return NXT_ERROR;
557     }
558 
559     n = mounts->nelts;
560     mnt = mounts->elts;
561     lang_mnt = lang_mounts->elts;
562 
563     rootfs = process->isolation.rootfs;
564     rootfs_len = nxt_strlen(rootfs);
565 
566     for (i = 0; i < n; i++) {
567         len = nxt_strlen(lang_mnt[i].dst);
568 
569         mnt[i].dst = nxt_mp_alloc(mp, rootfs_len + len + 1);
570         if (nxt_slow_path(mnt[i].dst == NULL)) {
571             return NXT_ERROR;
572         }
573 
574         p = nxt_cpymem(mnt[i].dst, rootfs, rootfs_len);
575         p = nxt_cpymem(p, lang_mnt[i].dst, len);
576         *p = '\0';
577     }
578 
579     mnt = nxt_array_add(mounts);
580     if (nxt_slow_path(mnt == NULL)) {
581         return NXT_ERROR;
582     }
583 
584     mnt->src = (u_char *) "tmpfs";
585     mnt->name = (u_char *) "tmpfs";
586     mnt->type = NXT_FS_TMP;
587     mnt->flags = (NXT_FS_FLAGS_NOSUID | NXT_FS_FLAGS_NODEV
588                   | NXT_FS_FLAGS_NOEXEC);
589     mnt->data = (u_char *) "size=1m,mode=777";
590     mnt->builtin = 1;
591     mnt->deps = 0;
592 
593     mnt->dst = nxt_mp_nget(mp, rootfs_len + nxt_length("/tmp") + 1);
594     if (nxt_slow_path(mnt->dst == NULL)) {
595         return NXT_ERROR;
596     }
597 
598     p = nxt_cpymem(mnt->dst, rootfs, rootfs_len);
599     p = nxt_cpymem(p, "/tmp", 4);
600     *p = '\0';
601 
602     mnt = nxt_array_add(mounts);
603     if (nxt_slow_path(mnt == NULL)) {
604         return NXT_ERROR;
605     }
606 
607     mnt->name = (u_char *) "proc";
608     mnt->type = NXT_FS_PROC;
609     mnt->src = (u_char *) "none";
610     mnt->dst = nxt_mp_nget(mp, rootfs_len + nxt_length("/proc") + 1);
611     if (nxt_slow_path(mnt->dst == NULL)) {
612         return NXT_ERROR;
613     }
614 
615     p = nxt_cpymem(mnt->dst, rootfs, rootfs_len);
616     p = nxt_cpymem(p, "/proc", 5);
617     *p = '\0';
618 
619     mnt->data = (u_char *) "";
620     mnt->flags = NXT_FS_FLAGS_NOEXEC | NXT_FS_FLAGS_NOSUID;
621     mnt->builtin = 1;
622     mnt->deps = 0;
623 
624     qsort(mounts->elts, mounts->nelts, sizeof(nxt_fs_mount_t),
625           nxt_isolation_mount_compare);
626 
627     process->isolation.mounts = mounts;
628 
629     return NXT_OK;
630 }
631 
632 
633 static int nxt_cdecl
634 nxt_isolation_mount_compare(const void *v1, const void *v2)
635 {
636     const nxt_fs_mount_t  *mnt1, *mnt2;
637 
638     mnt1 = v1;
639     mnt2 = v2;
640 
641     return nxt_strlen(mnt1->src) > nxt_strlen(mnt2->src);
642 }
643 
644 
645 void
646 nxt_isolation_unmount_all(nxt_task_t *task, nxt_process_t *process)
647 {
648     size_t                   n;
649     nxt_array_t              *mounts;
650     nxt_runtime_t            *rt;
651     nxt_fs_mount_t           *mnt;
652     nxt_process_automount_t  *automount;
653 
654     rt = task->thread->runtime;
655 
656     if (!rt->capabilities.setid) {
657         return;
658     }
659 
660 #if (NXT_HAVE_CLONE_NEWNS)
661     if (nxt_is_clone_flag_set(process->isolation.clone.flags, NEWNS)) {
662         return;
663     }
664 #endif
665 
666     nxt_debug(task, "unmount all (%s)", process->name);
667 
668     automount = &process->isolation.automount;
669     mounts = process->isolation.mounts;
670     n = mounts->nelts;
671     mnt = mounts->elts;
672 
673     while (n > 0) {
674         n--;
675 
676         if (mnt[n].deps && !automount->language_deps) {
677             continue;
678         }
679 
680         nxt_fs_unmount(mnt[n].dst);
681     }
682 }
683 
684 
685 nxt_int_t
686 nxt_isolation_prepare_rootfs(nxt_task_t *task, nxt_process_t *process)
687 {
688     size_t                   i, n;
689     nxt_int_t                ret;
690     struct stat              st;
691     nxt_array_t              *mounts;
692     const u_char             *dst;
693     nxt_fs_mount_t           *mnt;
694     nxt_process_automount_t  *automount;
695 
696     automount = &process->isolation.automount;
697     mounts = process->isolation.mounts;
698 
699     n = mounts->nelts;
700     mnt = mounts->elts;
701 
702     for (i = 0; i < n; i++) {
703         dst = mnt[i].dst;
704 
705         if (mnt[i].deps && !automount->language_deps) {
706             continue;
707         }
708 
709         if (nxt_slow_path(mnt[i].type == NXT_FS_BIND
710                           && stat((const char *) mnt[i].src, &st) != 0))
711         {
712             nxt_log(task, NXT_LOG_WARN, "host path not found: %s", mnt[i].src);
713             continue;
714         }
715 
716         ret = nxt_fs_mkdir_all(dst, S_IRWXU | S_IRWXG | S_IRWXO);
717         if (nxt_slow_path(ret != NXT_OK)) {
718             nxt_alert(task, "mkdir(%s) %E", dst, nxt_errno);
719             goto undo;
720         }
721 
722         ret = nxt_fs_mount(task, &mnt[i]);
723         if (nxt_slow_path(ret != NXT_OK)) {
724             goto undo;
725         }
726     }
727 
728     return NXT_OK;
729 
730 undo:
731 
732     n = i + 1;
733 
734     for (i = 0; i < n; i++) {
735         nxt_fs_unmount(mnt[i].dst);
736     }
737 
738     return NXT_ERROR;
739 }
740 
741 
742 #if (NXT_HAVE_PIVOT_ROOT) && (NXT_HAVE_CLONE_NEWNS)
743 
744 nxt_int_t
745 nxt_isolation_change_root(nxt_task_t *task, nxt_process_t *process)
746 {
747     char       *rootfs;
748     nxt_int_t  ret;
749 
750     rootfs = (char *) process->isolation.rootfs;
751 
752     nxt_debug(task, "change root: %s", rootfs);
753 
754     if (nxt_is_clone_flag_set(process->isolation.clone.flags, NEWNS)) {
755         ret = nxt_isolation_pivot_root(task, rootfs);
756 
757     } else {
758         ret = nxt_isolation_chroot(task, rootfs);
759     }
760 
761     if (nxt_fast_path(ret == NXT_OK)) {
762         if (nxt_slow_path(chdir("/") < 0)) {
763             nxt_alert(task, "chdir(\"/\") %E", nxt_errno);
764             return NXT_ERROR;
765         }
766     }
767 
768     return ret;
769 }
770 
771 
772 /*
773  * pivot_root(2) can only be safely used with containers, otherwise it can
774  * umount(2) the global root filesystem and screw up the machine.
775  */
776 
777 static nxt_int_t
778 nxt_isolation_pivot_root(nxt_task_t *task, const char *path)
779 {
780     /*
781      * This implementation makes use of a kernel trick that works for ages
782      * and now documented in Linux kernel 5.
783      * https://lore.kernel.org/linux-man/87r24piwhm.fsf@x220.int.ebiederm.org/T/
784      */
785 
786     if (nxt_slow_path(mount("", "/", "", MS_SLAVE|MS_REC, "") != 0)) {
787         nxt_alert(task, "mount(\"/\", MS_SLAVE|MS_REC) failed: %E", nxt_errno);
788         return NXT_ERROR;
789     }
790 
791     if (nxt_slow_path(nxt_isolation_make_private_mount(task, path) != NXT_OK)) {
792         return NXT_ERROR;
793     }
794 
795     if (nxt_slow_path(mount(path, path, "bind", MS_BIND|MS_REC, "") != 0)) {
796         nxt_alert(task, "error bind mounting rootfs %E", nxt_errno);
797         return NXT_ERROR;
798     }
799 
800     if (nxt_slow_path(chdir(path) != 0)) {
801         nxt_alert(task, "failed to chdir(%s) %E", path, nxt_errno);
802         return NXT_ERROR;
803     }
804 
805     if (nxt_slow_path(nxt_pivot_root(".", ".") != 0)) {
806         nxt_alert(task, "failed to pivot_root %E", nxt_errno);
807         return NXT_ERROR;
808     }
809 
810     /*
811      * Demote the oldroot mount to avoid unmounts getting propagated to
812      * the host.
813      */
814     if (nxt_slow_path(mount("", ".", "", MS_SLAVE | MS_REC, NULL) != 0)) {
815         nxt_alert(task, "failed to bind mount rootfs %E", nxt_errno);
816         return NXT_ERROR;
817     }
818 
819     if (nxt_slow_path(umount2(".", MNT_DETACH) != 0)) {
820         nxt_alert(task, "failed to umount old root directory %E", nxt_errno);
821         return NXT_ERROR;
822     }
823 
824     return NXT_OK;
825 }
826 
827 
828 static nxt_int_t
829 nxt_isolation_make_private_mount(nxt_task_t *task, const char *rootfs)
830 {
831     char           *parent_mnt;
832     FILE           *procfile;
833     u_char         **mounts;
834     size_t         len;
835     uint8_t        *shared;
836     nxt_int_t      ret, index, nmounts;
837     struct mntent  *ent;
838 
839     static const char  *mount_path = "/proc/self/mounts";
840 
841     ret = NXT_ERROR;
842     ent = NULL;
843     shared = NULL;
844     procfile = NULL;
845     parent_mnt = NULL;
846 
847     nmounts = 256;
848 
849     mounts = nxt_malloc(nmounts * sizeof(uintptr_t));
850     if (nxt_slow_path(mounts == NULL)) {
851         goto fail;
852     }
853 
854     shared = nxt_malloc(nmounts);
855     if (nxt_slow_path(shared == NULL)) {
856         goto fail;
857     }
858 
859     procfile = setmntent(mount_path, "r");
860     if (nxt_slow_path(procfile == NULL)) {
861         nxt_alert(task, "failed to open %s %E", mount_path, nxt_errno);
862 
863         goto fail;
864     }
865 
866     index = 0;
867 
868 again:
869 
870     for ( ; index < nmounts; index++) {
871         ent = getmntent(procfile);
872         if (ent == NULL) {
873             nmounts = index;
874             break;
875         }
876 
877         mounts[index] = (u_char *) strdup(ent->mnt_dir);
878         shared[index] = hasmntopt(ent, "shared") != NULL;
879     }
880 
881     if (ent != NULL) {
882         /* there are still entries to be read */
883 
884         nmounts *= 2;
885         mounts = nxt_realloc(mounts, nmounts);
886         if (nxt_slow_path(mounts == NULL)) {
887             goto fail;
888         }
889 
890         shared = nxt_realloc(shared, nmounts);
891         if (nxt_slow_path(shared == NULL)) {
892             goto fail;
893         }
894 
895         goto again;
896     }
897 
898     for (index = 0; index < nmounts; index++) {
899         if (nxt_strcmp(mounts[index], rootfs) == 0) {
900             parent_mnt = (char *) rootfs;
901             break;
902         }
903     }
904 
905     if (parent_mnt == NULL) {
906         len = nxt_strlen(rootfs);
907 
908         parent_mnt = nxt_malloc(len + 1);
909         if (parent_mnt == NULL) {
910             goto fail;
911         }
912 
913         nxt_memcpy(parent_mnt, rootfs, len);
914         parent_mnt[len] = '\0';
915 
916         if (parent_mnt[len - 1] == '/') {
917             parent_mnt[len - 1] = '\0';
918             len--;
919         }
920 
921         for ( ;; ) {
922             for (index = 0; index < nmounts; index++) {
923                 if (nxt_strcmp(mounts[index], parent_mnt) == 0) {
924                     goto found;
925                 }
926             }
927 
928             if (len == 1 && parent_mnt[0] == '/') {
929                 nxt_alert(task, "parent mount not found");
930                 goto fail;
931             }
932 
933             /* parent dir */
934             while (parent_mnt[len - 1] != '/' && len > 0) {
935                 len--;
936             }
937 
938             if (nxt_slow_path(len == 0)) {
939                 nxt_alert(task, "parent mount not found");
940                 goto fail;
941             }
942 
943             if (len == 1) {
944                 parent_mnt[len] = '\0';     /* / */
945             } else {
946                 parent_mnt[len - 1] = '\0'; /* /<path> */
947             }
948         }
949     }
950 
951 found:
952 
953     if (shared[index]) {
954         if (nxt_slow_path(mount("", parent_mnt, "", MS_PRIVATE, "") != 0)) {
955             nxt_alert(task, "mount(\"\", \"%s\", MS_PRIVATE) %E", parent_mnt,
956                       nxt_errno);
957 
958             goto fail;
959         }
960     }
961 
962     ret = NXT_OK;
963 
964 fail:
965 
966     if (procfile != NULL) {
967         endmntent(procfile);
968     }
969 
970     if (mounts != NULL) {
971         for (index = 0; index < nmounts; index++) {
972             nxt_free(mounts[index]);
973         }
974 
975         nxt_free(mounts);
976     }
977 
978     if (shared != NULL) {
979         nxt_free(shared);
980     }
981 
982     if (parent_mnt != NULL && parent_mnt != rootfs) {
983         nxt_free(parent_mnt);
984     }
985 
986     return ret;
987 }
988 
989 
990 nxt_inline int
991 nxt_pivot_root(const char *new_root, const char *old_root)
992 {
993     return syscall(__NR_pivot_root, new_root, old_root);
994 }
995 
996 
997 #else /* !(NXT_HAVE_PIVOT_ROOT) || !(NXT_HAVE_CLONE_NEWNS) */
998 
999 
1000 nxt_int_t
1001 nxt_isolation_change_root(nxt_task_t *task, nxt_process_t *process)
1002 {
1003     char       *rootfs;
1004 
1005     rootfs = (char *) process->isolation.rootfs;
1006 
1007     nxt_debug(task, "change root: %s", rootfs);
1008 
1009     if (nxt_fast_path(nxt_isolation_chroot(task, rootfs) == NXT_OK)) {
1010         if (nxt_slow_path(chdir("/") < 0)) {
1011             nxt_alert(task, "chdir(\"/\") %E", nxt_errno);
1012             return NXT_ERROR;
1013         }
1014 
1015         return NXT_OK;
1016     }
1017 
1018     return NXT_ERROR;
1019 }
1020 
1021 #endif
1022 
1023 
1024 static nxt_int_t
1025 nxt_isolation_chroot(nxt_task_t *task, const char *path)
1026 {
1027     if (nxt_slow_path(chroot(path) < 0)) {
1028         nxt_alert(task, "chroot(%s) %E", path, nxt_errno);
1029         return NXT_ERROR;
1030     }
1031 
1032     return NXT_OK;
1033 }
1034 
1035 #endif /* NXT_HAVE_ISOLATION_ROOTFS */
1036 
1037 
1038 #if (NXT_HAVE_PR_SET_NO_NEW_PRIVS)
1039 
1040 static nxt_int_t
1041 nxt_isolation_set_new_privs(nxt_task_t *task, nxt_conf_value_t *isolation,
1042     nxt_process_t *process)
1043 {
1044     nxt_conf_value_t  *obj;
1045 
1046     static nxt_str_t  new_privs_name = nxt_string("new_privs");
1047 
1048     obj = nxt_conf_get_object_member(isolation, &new_privs_name, NULL);
1049     if (obj != NULL) {
1050         process->isolation.new_privs = nxt_conf_get_boolean(obj);
1051     }
1052 
1053     return NXT_OK;
1054 }
1055 
1056 #endif
1057