xref: /unit/src/nxt_isolation.c (revision 1585:e941d77852d1)
1 /*
2  * Copyright (C) NGINX, Inc.
3  */
4 
5 #include <nxt_main.h>
6 #include <nxt_application.h>
7 #include <nxt_process.h>
8 #include <nxt_isolation.h>
9 
10 #if (NXT_HAVE_PIVOT_ROOT)
11 #include <mntent.h>
12 #endif
13 
14 
15 static nxt_int_t nxt_isolation_set(nxt_task_t *task,
16     nxt_conf_value_t *isolation, nxt_process_t *process);
17 
18 #if (NXT_HAVE_CLONE)
19 static nxt_int_t nxt_isolation_set_namespaces(nxt_task_t *task,
20     nxt_conf_value_t *isolation, nxt_process_t *process);
21 static nxt_int_t nxt_isolation_clone_flags(nxt_task_t *task,
22     nxt_conf_value_t *namespaces, nxt_clone_t *clone);
23 #endif
24 
25 #if (NXT_HAVE_CLONE_NEWUSER)
26 static nxt_int_t nxt_isolation_set_creds(nxt_task_t *task,
27     nxt_conf_value_t *isolation, nxt_process_t *process);
28 static nxt_int_t nxt_isolation_credential_map(nxt_task_t *task,
29     nxt_mp_t *mem_pool, nxt_conf_value_t *map_array,
30     nxt_clone_credential_map_t *map);
31 static nxt_int_t nxt_isolation_vldt_creds(nxt_task_t *task,
32     nxt_process_t *process);
33 #endif
34 
35 #if (NXT_HAVE_ISOLATION_ROOTFS)
36 static nxt_int_t nxt_isolation_set_rootfs(nxt_task_t *task,
37     nxt_conf_value_t *isolation, nxt_process_t *process);
38 static nxt_int_t nxt_isolation_set_automount(nxt_task_t *task,
39     nxt_conf_value_t *isolation, nxt_process_t *process);
40 static nxt_int_t nxt_isolation_set_mounts(nxt_task_t *task,
41     nxt_process_t *process, nxt_str_t *app_type);
42 static nxt_int_t nxt_isolation_set_lang_mounts(nxt_task_t *task,
43     nxt_process_t *process, nxt_array_t *syspaths);
44 static void nxt_isolation_unmount_all(nxt_task_t *task, nxt_process_t *process);
45 
46 #if (NXT_HAVE_PIVOT_ROOT) && (NXT_HAVE_CLONE_NEWNS)
47 static nxt_int_t nxt_isolation_pivot_root(nxt_task_t *task, const char *rootfs);
48 static nxt_int_t nxt_isolation_make_private_mount(nxt_task_t *task,
49     const char *rootfs);
50 nxt_inline int nxt_pivot_root(const char *new_root, const char *old_root);
51 #endif
52 
53 static nxt_int_t nxt_isolation_chroot(nxt_task_t *task, const char *path);
54 #endif
55 
56 #if (NXT_HAVE_PR_SET_NO_NEW_PRIVS)
57 static nxt_int_t nxt_isolation_set_new_privs(nxt_task_t *task,
58     nxt_conf_value_t *isolation, nxt_process_t *process);
59 #endif
60 
61 
62 nxt_int_t
63 nxt_isolation_main_prefork(nxt_task_t *task, nxt_process_t *process,
64     nxt_mp_t *mp)
65 {
66     nxt_int_t              cap_setid;
67     nxt_int_t              ret;
68     nxt_runtime_t          *rt;
69     nxt_common_app_conf_t  *app_conf;
70 
71     rt = task->thread->runtime;
72     app_conf = process->data.app;
73     cap_setid = rt->capabilities.setid;
74 
75     if (app_conf->isolation != NULL) {
76         ret = nxt_isolation_set(task, app_conf->isolation, process);
77         if (nxt_slow_path(ret != NXT_OK)) {
78             return ret;
79         }
80     }
81 
82 #if (NXT_HAVE_CLONE_NEWUSER)
83     if (nxt_is_clone_flag_set(process->isolation.clone.flags, NEWUSER)) {
84         cap_setid = 1;
85     }
86 #endif
87 
88 #if (NXT_HAVE_ISOLATION_ROOTFS)
89     if (process->isolation.rootfs != NULL) {
90         ret = nxt_isolation_set_mounts(task, process, &app_conf->type);
91         if (nxt_slow_path(ret != NXT_OK)) {
92             return ret;
93         }
94     }
95 #endif
96 
97     if (cap_setid) {
98         ret = nxt_process_creds_set(task, process, &app_conf->user,
99                                     &app_conf->group);
100 
101         if (nxt_slow_path(ret != NXT_OK)) {
102             return ret;
103         }
104 
105     } else {
106         if (!nxt_str_eq(&app_conf->user, (u_char *) rt->user_cred.user,
107                         nxt_strlen(rt->user_cred.user)))
108         {
109             nxt_alert(task, "cannot set user \"%V\" for app \"%V\": "
110                       "missing capabilities", &app_conf->user, &app_conf->name);
111 
112             return NXT_ERROR;
113         }
114 
115         if (app_conf->group.length > 0
116             && !nxt_str_eq(&app_conf->group, (u_char *) rt->group,
117                            nxt_strlen(rt->group)))
118         {
119             nxt_alert(task, "cannot set group \"%V\" for app \"%V\": "
120                             "missing capabilities", &app_conf->group,
121                             &app_conf->name);
122 
123             return NXT_ERROR;
124         }
125     }
126 
127 #if (NXT_HAVE_CLONE_NEWUSER)
128     ret = nxt_isolation_vldt_creds(task, process);
129     if (nxt_slow_path(ret != NXT_OK)) {
130         return ret;
131     }
132 #endif
133 
134     return NXT_OK;
135 }
136 
137 
138 static nxt_int_t
139 nxt_isolation_set(nxt_task_t *task, nxt_conf_value_t *isolation,
140     nxt_process_t *process)
141 {
142 #if (NXT_HAVE_CLONE)
143     if (nxt_slow_path(nxt_isolation_set_namespaces(task, isolation, process)
144                       != NXT_OK))
145     {
146         return NXT_ERROR;
147     }
148 #endif
149 
150 #if (NXT_HAVE_CLONE_NEWUSER)
151     if (nxt_slow_path(nxt_isolation_set_creds(task, isolation, process)
152                       != NXT_OK))
153     {
154         return NXT_ERROR;
155     }
156 #endif
157 
158 #if (NXT_HAVE_ISOLATION_ROOTFS)
159     if (nxt_slow_path(nxt_isolation_set_rootfs(task, isolation, process)
160                       != NXT_OK))
161     {
162         return NXT_ERROR;
163     }
164 
165     if (nxt_slow_path(nxt_isolation_set_automount(task, isolation, process)
166                       != NXT_OK))
167     {
168         return NXT_ERROR;
169     }
170 #endif
171 
172 #if (NXT_HAVE_PR_SET_NO_NEW_PRIVS)
173     if (nxt_slow_path(nxt_isolation_set_new_privs(task, isolation, process)
174                       != NXT_OK))
175     {
176         return NXT_ERROR;
177     }
178 #endif
179 
180     return NXT_OK;
181 }
182 
183 
184 #if (NXT_HAVE_CLONE)
185 
186 static nxt_int_t
187 nxt_isolation_set_namespaces(nxt_task_t *task, nxt_conf_value_t *isolation,
188     nxt_process_t *process)
189 {
190     nxt_int_t         ret;
191     nxt_conf_value_t  *obj;
192 
193     static nxt_str_t  nsname = nxt_string("namespaces");
194 
195     obj = nxt_conf_get_object_member(isolation, &nsname, NULL);
196     if (obj != NULL) {
197         ret = nxt_isolation_clone_flags(task, obj, &process->isolation.clone);
198         if (nxt_slow_path(ret != NXT_OK)) {
199             return NXT_ERROR;
200         }
201     }
202 
203     return NXT_OK;
204 }
205 
206 #endif
207 
208 
209 #if (NXT_HAVE_CLONE_NEWUSER)
210 
211 static nxt_int_t
212 nxt_isolation_set_creds(nxt_task_t *task, nxt_conf_value_t *isolation,
213     nxt_process_t *process)
214 {
215     nxt_int_t         ret;
216     nxt_clone_t       *clone;
217     nxt_conf_value_t  *array;
218 
219     static nxt_str_t uidname = nxt_string("uidmap");
220     static nxt_str_t gidname = nxt_string("gidmap");
221 
222     clone = &process->isolation.clone;
223 
224     array = nxt_conf_get_object_member(isolation, &uidname, NULL);
225     if (array != NULL) {
226         ret = nxt_isolation_credential_map(task, process->mem_pool, array,
227                                            &clone->uidmap);
228 
229         if (nxt_slow_path(ret != NXT_OK)) {
230             return NXT_ERROR;
231         }
232     }
233 
234     array = nxt_conf_get_object_member(isolation, &gidname, NULL);
235     if (array != NULL) {
236         ret = nxt_isolation_credential_map(task, process->mem_pool, array,
237                                            &clone->gidmap);
238 
239         if (nxt_slow_path(ret != NXT_OK)) {
240             return NXT_ERROR;
241         }
242     }
243 
244     return NXT_OK;
245 }
246 
247 
248 static nxt_int_t
249 nxt_isolation_credential_map(nxt_task_t *task, nxt_mp_t *mp,
250     nxt_conf_value_t *map_array, nxt_clone_credential_map_t *map)
251 {
252     nxt_int_t         ret;
253     nxt_uint_t        i;
254     nxt_conf_value_t  *obj;
255 
256     static nxt_conf_map_t  nxt_clone_map_entry_conf[] = {
257         {
258             nxt_string("container"),
259             NXT_CONF_MAP_INT,
260             offsetof(nxt_clone_map_entry_t, container),
261         },
262 
263         {
264             nxt_string("host"),
265             NXT_CONF_MAP_INT,
266             offsetof(nxt_clone_map_entry_t, host),
267         },
268 
269         {
270             nxt_string("size"),
271             NXT_CONF_MAP_INT,
272             offsetof(nxt_clone_map_entry_t, size),
273         },
274     };
275 
276     map->size = nxt_conf_array_elements_count(map_array);
277 
278     if (map->size == 0) {
279         return NXT_OK;
280     }
281 
282     map->map = nxt_mp_alloc(mp, map->size * sizeof(nxt_clone_map_entry_t));
283     if (nxt_slow_path(map->map == NULL)) {
284         return NXT_ERROR;
285     }
286 
287     for (i = 0; i < map->size; i++) {
288         obj = nxt_conf_get_array_element(map_array, i);
289 
290         ret = nxt_conf_map_object(mp, obj, nxt_clone_map_entry_conf,
291                                   nxt_nitems(nxt_clone_map_entry_conf),
292                                   map->map + i);
293         if (nxt_slow_path(ret != NXT_OK)) {
294             nxt_alert(task, "clone map entry map error");
295             return NXT_ERROR;
296         }
297     }
298 
299     return NXT_OK;
300 }
301 
302 
303 static nxt_int_t
304 nxt_isolation_vldt_creds(nxt_task_t *task, nxt_process_t *process)
305 {
306     nxt_int_t         ret;
307     nxt_clone_t       *clone;
308     nxt_credential_t  *creds;
309 
310     clone = &process->isolation.clone;
311     creds = process->user_cred;
312 
313     if (clone->uidmap.size == 0 && clone->gidmap.size == 0) {
314         return NXT_OK;
315     }
316 
317     if (!nxt_is_clone_flag_set(clone->flags, NEWUSER)) {
318         if (nxt_slow_path(clone->uidmap.size > 0)) {
319             nxt_log(task, NXT_LOG_ERR, "\"uidmap\" is set but "
320                     "\"isolation.namespaces.credential\" is false or unset");
321 
322             return NXT_ERROR;
323         }
324 
325         if (nxt_slow_path(clone->gidmap.size > 0)) {
326             nxt_log(task, NXT_LOG_ERR, "\"gidmap\" is set but "
327                     "\"isolation.namespaces.credential\" is false or unset");
328 
329             return NXT_ERROR;
330         }
331 
332         return NXT_OK;
333     }
334 
335     ret = nxt_clone_vldt_credential_uidmap(task, &clone->uidmap, creds);
336     if (nxt_slow_path(ret != NXT_OK)) {
337         return NXT_ERROR;
338     }
339 
340     return nxt_clone_vldt_credential_gidmap(task, &clone->gidmap, creds);
341 }
342 
343 #endif
344 
345 
346 #if (NXT_HAVE_CLONE)
347 
348 static nxt_int_t
349 nxt_isolation_clone_flags(nxt_task_t *task, nxt_conf_value_t *namespaces,
350     nxt_clone_t *clone)
351 {
352     uint32_t          index;
353     nxt_str_t         name;
354     nxt_int_t         flag;
355     nxt_conf_value_t  *value;
356 
357     index = 0;
358 
359     for ( ;; ) {
360         value = nxt_conf_next_object_member(namespaces, &name, &index);
361 
362         if (value == NULL) {
363             break;
364         }
365 
366         flag = 0;
367 
368 #if (NXT_HAVE_CLONE_NEWUSER)
369         if (nxt_str_eq(&name, "credential", 10)) {
370             flag = CLONE_NEWUSER;
371         }
372 #endif
373 
374 #if (NXT_HAVE_CLONE_NEWPID)
375         if (nxt_str_eq(&name, "pid", 3)) {
376             flag = CLONE_NEWPID;
377         }
378 #endif
379 
380 #if (NXT_HAVE_CLONE_NEWNET)
381         if (nxt_str_eq(&name, "network", 7)) {
382             flag = CLONE_NEWNET;
383         }
384 #endif
385 
386 #if (NXT_HAVE_CLONE_NEWUTS)
387         if (nxt_str_eq(&name, "uname", 5)) {
388             flag = CLONE_NEWUTS;
389         }
390 #endif
391 
392 #if (NXT_HAVE_CLONE_NEWNS)
393         if (nxt_str_eq(&name, "mount", 5)) {
394             flag = CLONE_NEWNS;
395         }
396 #endif
397 
398 #if (NXT_HAVE_CLONE_NEWCGROUP)
399         if (nxt_str_eq(&name, "cgroup", 6)) {
400             flag = CLONE_NEWCGROUP;
401         }
402 #endif
403 
404         if (!flag) {
405             nxt_alert(task, "unknown namespace flag: \"%V\"", &name);
406             return NXT_ERROR;
407         }
408 
409         if (nxt_conf_get_boolean(value)) {
410             clone->flags |= flag;
411         }
412     }
413 
414     return NXT_OK;
415 }
416 
417 #endif
418 
419 
420 #if (NXT_HAVE_ISOLATION_ROOTFS)
421 
422 static nxt_int_t
423 nxt_isolation_set_rootfs(nxt_task_t *task, nxt_conf_value_t *isolation,
424     nxt_process_t *process)
425 {
426     nxt_str_t         str;
427     nxt_conf_value_t  *obj;
428 
429     static nxt_str_t  rootfs_name = nxt_string("rootfs");
430 
431     obj = nxt_conf_get_object_member(isolation, &rootfs_name, NULL);
432     if (obj != NULL) {
433         nxt_conf_get_string(obj, &str);
434 
435         if (nxt_slow_path(str.length <= 1 || str.start[0] != '/')) {
436             nxt_log(task, NXT_LOG_ERR, "rootfs requires an absolute path other "
437                     "than \"/\" but given \"%V\"", &str);
438 
439             return NXT_ERROR;
440         }
441 
442         if (str.start[str.length - 1] == '/') {
443             str.length--;
444         }
445 
446         process->isolation.rootfs = nxt_mp_alloc(process->mem_pool,
447                                                  str.length + 1);
448 
449         if (nxt_slow_path(process->isolation.rootfs == NULL)) {
450             return NXT_ERROR;
451         }
452 
453         nxt_memcpy(process->isolation.rootfs, str.start, str.length);
454 
455         process->isolation.rootfs[str.length] = '\0';
456     }
457 
458     return NXT_OK;
459 }
460 
461 
462 static nxt_int_t
463 nxt_isolation_set_automount(nxt_task_t *task, nxt_conf_value_t *isolation,
464     nxt_process_t *process)
465 {
466     nxt_conf_value_t         *conf, *value;
467     nxt_process_automount_t  *automount;
468 
469     static nxt_str_t  automount_name = nxt_string("automount");
470     static nxt_str_t  langdeps_name = nxt_string("language_deps");
471 
472     automount = &process->isolation.automount;
473 
474     automount->language_deps = 1;
475 
476     conf = nxt_conf_get_object_member(isolation, &automount_name, NULL);
477     if (conf != NULL) {
478         value = nxt_conf_get_object_member(conf, &langdeps_name, NULL);
479         if (value != NULL) {
480             automount->language_deps = nxt_conf_get_boolean(value);
481         }
482     }
483 
484     return NXT_OK;
485 }
486 
487 
488 static nxt_int_t
489 nxt_isolation_set_mounts(nxt_task_t *task, nxt_process_t *process,
490     nxt_str_t *app_type)
491 {
492     nxt_int_t              ret, cap_chroot;
493     nxt_runtime_t          *rt;
494     nxt_app_lang_module_t  *lang;
495 
496     rt = task->thread->runtime;
497     cap_chroot = rt->capabilities.chroot;
498     lang = nxt_app_lang_module(rt, app_type);
499 
500     nxt_assert(lang != NULL);
501 
502 #if (NXT_HAVE_CLONE_NEWUSER)
503     if (nxt_is_clone_flag_set(process->isolation.clone.flags, NEWUSER)) {
504         cap_chroot = 1;
505     }
506 #endif
507 
508     if (!cap_chroot) {
509         nxt_log(task, NXT_LOG_ERR, "The \"rootfs\" field requires privileges");
510         return NXT_ERROR;
511     }
512 
513     ret = nxt_isolation_set_lang_mounts(task, process, lang->mounts);
514     if (nxt_slow_path(ret != NXT_OK)) {
515         return NXT_ERROR;
516     }
517 
518     process->isolation.cleanup = nxt_isolation_unmount_all;
519 
520     return NXT_OK;
521 }
522 
523 
524 static nxt_int_t
525 nxt_isolation_set_lang_mounts(nxt_task_t *task, nxt_process_t *process,
526     nxt_array_t *lang_mounts)
527 {
528     u_char          *p;
529     size_t          i, n, rootfs_len, len;
530     nxt_mp_t        *mp;
531     nxt_array_t     *mounts;
532     const u_char    *rootfs;
533     nxt_fs_mount_t  *mnt, *lang_mnt;
534 
535     mp = process->mem_pool;
536 
537     /* copy to init mem pool */
538     mounts = nxt_array_copy(mp, NULL, lang_mounts);
539     if (mounts == NULL) {
540         return NXT_ERROR;
541     }
542 
543     n = mounts->nelts;
544     mnt = mounts->elts;
545     lang_mnt = lang_mounts->elts;
546 
547     rootfs = process->isolation.rootfs;
548     rootfs_len = nxt_strlen(rootfs);
549 
550     for (i = 0; i < n; i++) {
551         len = nxt_strlen(lang_mnt[i].dst);
552 
553         mnt[i].dst = nxt_mp_alloc(mp, rootfs_len + len + 1);
554         if (nxt_slow_path(mnt[i].dst == NULL)) {
555             return NXT_ERROR;
556         }
557 
558         p = nxt_cpymem(mnt[i].dst, rootfs, rootfs_len);
559         p = nxt_cpymem(p, lang_mnt[i].dst, len);
560         *p = '\0';
561     }
562 
563     mnt = nxt_array_add(mounts);
564     if (nxt_slow_path(mnt == NULL)) {
565         return NXT_ERROR;
566     }
567 
568     mnt->src = (u_char *) "tmpfs";
569     mnt->fstype = (u_char *) "tmpfs";
570     mnt->flags = NXT_MS_NOSUID | NXT_MS_NODEV | NXT_MS_NOEXEC | NXT_MS_RELATIME;
571     mnt->data = (u_char *) "size=1m,mode=777";
572     mnt->builtin = 1;
573 
574     mnt->dst = nxt_mp_nget(mp, rootfs_len + nxt_length("/tmp") + 1);
575     if (nxt_slow_path(mnt->dst == NULL)) {
576         return NXT_ERROR;
577     }
578 
579     p = nxt_cpymem(mnt->dst, rootfs, rootfs_len);
580     p = nxt_cpymem(p, "/tmp", 4);
581     *p = '\0';
582 
583 #if (NXT_HAVE_CLONE_NEWPID) && (NXT_HAVE_CLONE_NEWNS)
584 
585     if (nxt_is_clone_flag_set(process->isolation.clone.flags, NEWPID)
586         && nxt_is_clone_flag_set(process->isolation.clone.flags, NEWNS))
587     {
588         mnt = nxt_array_add(mounts);
589         if (nxt_slow_path(mnt == NULL)) {
590             return NXT_ERROR;
591         }
592 
593         mnt->fstype = (u_char *) "proc";
594         mnt->src = (u_char *) "proc";
595 
596         mnt->dst = nxt_mp_nget(mp, rootfs_len + nxt_length("/proc") + 1);
597         if (nxt_slow_path(mnt->dst == NULL)) {
598             return NXT_ERROR;
599         }
600 
601         p = nxt_cpymem(mnt->dst, rootfs, rootfs_len);
602         p = nxt_cpymem(p, "/proc", 5);
603         *p = '\0';
604 
605         mnt->data = (u_char *) "";
606         mnt->flags = 0;
607     }
608 #endif
609 
610     process->isolation.mounts = mounts;
611 
612     return NXT_OK;
613 }
614 
615 
616 void
617 nxt_isolation_unmount_all(nxt_task_t *task, nxt_process_t *process)
618 {
619     size_t                   i, n;
620     nxt_array_t              *mounts;
621     nxt_fs_mount_t           *mnt;
622     nxt_process_automount_t  *automount;
623 
624     nxt_debug(task, "unmount all (%s)", process->name);
625 
626     automount = &process->isolation.automount;
627     mounts = process->isolation.mounts;
628     n = mounts->nelts;
629     mnt = mounts->elts;
630 
631     for (i = 0; i < n; i++) {
632         if (mnt[i].builtin && !automount->language_deps) {
633             continue;
634         }
635 
636         nxt_fs_unmount(mnt[i].dst);
637     }
638 }
639 
640 
641 nxt_int_t
642 nxt_isolation_prepare_rootfs(nxt_task_t *task, nxt_process_t *process)
643 {
644     size_t                   i, n;
645     nxt_int_t                ret;
646     struct stat              st;
647     nxt_array_t              *mounts;
648     const u_char             *dst;
649     nxt_fs_mount_t           *mnt;
650     nxt_process_automount_t  *automount;
651 
652     automount = &process->isolation.automount;
653     mounts = process->isolation.mounts;
654 
655     n = mounts->nelts;
656     mnt = mounts->elts;
657 
658     for (i = 0; i < n; i++) {
659         dst = mnt[i].dst;
660 
661         if (mnt[i].builtin && !automount->language_deps) {
662             continue;
663         }
664 
665         if (nxt_slow_path(nxt_memcmp(mnt[i].fstype, "bind", 4) == 0
666                           && stat((const char *) mnt[i].src, &st) != 0))
667         {
668             nxt_log(task, NXT_LOG_WARN, "host path not found: %s", mnt[i].src);
669             continue;
670         }
671 
672         ret = nxt_fs_mkdir_all(dst, S_IRWXU | S_IRWXG | S_IRWXO);
673         if (nxt_slow_path(ret != NXT_OK)) {
674             nxt_alert(task, "mkdir(%s) %E", dst, nxt_errno);
675             goto undo;
676         }
677 
678         ret = nxt_fs_mount(task, &mnt[i]);
679         if (nxt_slow_path(ret != NXT_OK)) {
680             goto undo;
681         }
682     }
683 
684     return NXT_OK;
685 
686 undo:
687 
688     n = i + 1;
689 
690     for (i = 0; i < n; i++) {
691         nxt_fs_unmount(mnt[i].dst);
692     }
693 
694     return NXT_ERROR;
695 }
696 
697 
698 #if (NXT_HAVE_PIVOT_ROOT) && (NXT_HAVE_CLONE_NEWNS)
699 
700 nxt_int_t
701 nxt_isolation_change_root(nxt_task_t *task, nxt_process_t *process)
702 {
703     char       *rootfs;
704     nxt_int_t  ret;
705 
706     rootfs = (char *) process->isolation.rootfs;
707 
708     nxt_debug(task, "change root: %s", rootfs);
709 
710     if (NXT_CLONE_MNT(process->isolation.clone.flags)) {
711         ret = nxt_isolation_pivot_root(task, rootfs);
712 
713     } else {
714         ret = nxt_isolation_chroot(task, rootfs);
715     }
716 
717     if (nxt_fast_path(ret == NXT_OK)) {
718         if (nxt_slow_path(chdir("/") < 0)) {
719             nxt_alert(task, "chdir(\"/\") %E", nxt_errno);
720             return NXT_ERROR;
721         }
722     }
723 
724     return ret;
725 }
726 
727 
728 /*
729  * pivot_root(2) can only be safely used with containers, otherwise it can
730  * umount(2) the global root filesystem and screw up the machine.
731  */
732 
733 static nxt_int_t
734 nxt_isolation_pivot_root(nxt_task_t *task, const char *path)
735 {
736     /*
737      * This implementation makes use of a kernel trick that works for ages
738      * and now documented in Linux kernel 5.
739      * https://lore.kernel.org/linux-man/87r24piwhm.fsf@x220.int.ebiederm.org/T/
740      */
741 
742     if (nxt_slow_path(mount("", "/", "", MS_SLAVE|MS_REC, "") != 0)) {
743         nxt_alert(task, "failed to make / a slave mount %E", nxt_errno);
744         return NXT_ERROR;
745     }
746 
747     if (nxt_slow_path(nxt_isolation_make_private_mount(task, path) != NXT_OK)) {
748         return NXT_ERROR;
749     }
750 
751     if (nxt_slow_path(mount(path, path, "bind", MS_BIND|MS_REC, "") != 0)) {
752         nxt_alert(task, "error bind mounting rootfs %E", nxt_errno);
753         return NXT_ERROR;
754     }
755 
756     if (nxt_slow_path(chdir(path) != 0)) {
757         nxt_alert(task, "failed to chdir(%s) %E", path, nxt_errno);
758         return NXT_ERROR;
759     }
760 
761     if (nxt_slow_path(nxt_pivot_root(".", ".") != 0)) {
762         nxt_alert(task, "failed to pivot_root %E", nxt_errno);
763         return NXT_ERROR;
764     }
765 
766     /*
767      * Make oldroot a slave mount to avoid unmounts getting propagated to the
768      * host.
769      */
770     if (nxt_slow_path(mount("", ".", "", MS_SLAVE | MS_REC, NULL) != 0)) {
771         nxt_alert(task, "failed to bind mount rootfs %E", nxt_errno);
772         return NXT_ERROR;
773     }
774 
775     if (nxt_slow_path(umount2(".", MNT_DETACH) != 0)) {
776         nxt_alert(task, "failed to umount old root directory %E", nxt_errno);
777         return NXT_ERROR;
778     }
779 
780     return NXT_OK;
781 }
782 
783 
784 static nxt_int_t
785 nxt_isolation_make_private_mount(nxt_task_t *task, const char *rootfs)
786 {
787     char           *parent_mnt;
788     FILE           *procfile;
789     u_char         **mounts;
790     size_t         len;
791     uint8_t        *shared;
792     nxt_int_t      ret, index, nmounts;
793     struct mntent  *ent;
794 
795     static const char  *mount_path = "/proc/self/mounts";
796 
797     ret = NXT_ERROR;
798     ent = NULL;
799     shared = NULL;
800     procfile = NULL;
801     parent_mnt = NULL;
802 
803     nmounts = 256;
804 
805     mounts = nxt_malloc(nmounts * sizeof(uintptr_t));
806     if (nxt_slow_path(mounts == NULL)) {
807         goto fail;
808     }
809 
810     shared = nxt_malloc(nmounts);
811     if (nxt_slow_path(shared == NULL)) {
812         goto fail;
813     }
814 
815     procfile = setmntent(mount_path, "r");
816     if (nxt_slow_path(procfile == NULL)) {
817         nxt_alert(task, "failed to open %s %E", mount_path, nxt_errno);
818 
819         goto fail;
820     }
821 
822     index = 0;
823 
824 again:
825 
826     for ( ; index < nmounts; index++) {
827         ent = getmntent(procfile);
828         if (ent == NULL) {
829             nmounts = index;
830             break;
831         }
832 
833         mounts[index] = (u_char *) strdup(ent->mnt_dir);
834         shared[index] = hasmntopt(ent, "shared") != NULL;
835     }
836 
837     if (ent != NULL) {
838         /* there are still entries to be read */
839 
840         nmounts *= 2;
841         mounts = nxt_realloc(mounts, nmounts);
842         if (nxt_slow_path(mounts == NULL)) {
843             goto fail;
844         }
845 
846         shared = nxt_realloc(shared, nmounts);
847         if (nxt_slow_path(shared == NULL)) {
848             goto fail;
849         }
850 
851         goto again;
852     }
853 
854     for (index = 0; index < nmounts; index++) {
855         if (nxt_strcmp(mounts[index], rootfs) == 0) {
856             parent_mnt = (char *) rootfs;
857             break;
858         }
859     }
860 
861     if (parent_mnt == NULL) {
862         len = nxt_strlen(rootfs);
863 
864         parent_mnt = nxt_malloc(len + 1);
865         if (parent_mnt == NULL) {
866             goto fail;
867         }
868 
869         nxt_memcpy(parent_mnt, rootfs, len);
870         parent_mnt[len] = '\0';
871 
872         if (parent_mnt[len - 1] == '/') {
873             parent_mnt[len - 1] = '\0';
874             len--;
875         }
876 
877         for ( ;; ) {
878             for (index = 0; index < nmounts; index++) {
879                 if (nxt_strcmp(mounts[index], parent_mnt) == 0) {
880                     goto found;
881                 }
882             }
883 
884             if (len == 1 && parent_mnt[0] == '/') {
885                 nxt_alert(task, "parent mount not found");
886                 goto fail;
887             }
888 
889             /* parent dir */
890             while (parent_mnt[len - 1] != '/' && len > 0) {
891                 len--;
892             }
893 
894             if (nxt_slow_path(len == 0)) {
895                 nxt_alert(task, "parent mount not found");
896                 goto fail;
897             }
898 
899             if (len == 1) {
900                 parent_mnt[len] = '\0';     /* / */
901             } else {
902                 parent_mnt[len - 1] = '\0'; /* /<path> */
903             }
904         }
905     }
906 
907 found:
908 
909     if (shared[index]) {
910         if (nxt_slow_path(mount("", parent_mnt, "", MS_PRIVATE, "") != 0)) {
911             nxt_alert(task, "mount(\"\", \"%s\", MS_PRIVATE) %E", parent_mnt,
912                       nxt_errno);
913 
914             goto fail;
915         }
916     }
917 
918     ret = NXT_OK;
919 
920 fail:
921 
922     if (procfile != NULL) {
923         endmntent(procfile);
924     }
925 
926     if (mounts != NULL) {
927         for (index = 0; index < nmounts; index++) {
928             nxt_free(mounts[index]);
929         }
930 
931         nxt_free(mounts);
932     }
933 
934     if (shared != NULL) {
935         nxt_free(shared);
936     }
937 
938     if (parent_mnt != NULL && parent_mnt != rootfs) {
939         nxt_free(parent_mnt);
940     }
941 
942     return ret;
943 }
944 
945 
946 nxt_inline int
947 nxt_pivot_root(const char *new_root, const char *old_root)
948 {
949     return syscall(__NR_pivot_root, new_root, old_root);
950 }
951 
952 
953 #else /* !(NXT_HAVE_PIVOT_ROOT) || !(NXT_HAVE_CLONE_NEWNS) */
954 
955 
956 nxt_int_t
957 nxt_isolation_change_root(nxt_task_t *task, nxt_process_t *process)
958 {
959     char       *rootfs;
960 
961     rootfs = (char *) process->isolation.rootfs;
962 
963     nxt_debug(task, "change root: %s", rootfs);
964 
965     if (nxt_fast_path(nxt_isolation_chroot(task, rootfs) == NXT_OK)) {
966         if (nxt_slow_path(chdir("/") < 0)) {
967             nxt_alert(task, "chdir(\"/\") %E", nxt_errno);
968             return NXT_ERROR;
969         }
970 
971         return NXT_OK;
972     }
973 
974     return NXT_ERROR;
975 }
976 
977 #endif
978 
979 
980 static nxt_int_t
981 nxt_isolation_chroot(nxt_task_t *task, const char *path)
982 {
983     if (nxt_slow_path(chroot(path) < 0)) {
984         nxt_alert(task, "chroot(%s) %E", path, nxt_errno);
985         return NXT_ERROR;
986     }
987 
988     return NXT_OK;
989 }
990 
991 #endif /* NXT_HAVE_ISOLATION_ROOTFS */
992 
993 
994 #if (NXT_HAVE_PR_SET_NO_NEW_PRIVS)
995 
996 static nxt_int_t
997 nxt_isolation_set_new_privs(nxt_task_t *task, nxt_conf_value_t *isolation,
998     nxt_process_t *process)
999 {
1000     nxt_conf_value_t  *obj;
1001 
1002     static nxt_str_t  new_privs_name = nxt_string("new_privs");
1003 
1004     obj = nxt_conf_get_object_member(isolation, &new_privs_name, NULL);
1005     if (obj != NULL) {
1006         process->isolation.new_privs = nxt_conf_get_boolean(obj);
1007     }
1008 
1009     return NXT_OK;
1010 }
1011 
1012 #endif
1013