1 /*
2 * Copyright (C) NGINX, Inc.
3 */
4
5 #include <nxt_main.h>
6 #include <nxt_application.h>
7 #include <nxt_process.h>
8 #include <nxt_isolation.h>
9 #include <nxt_cgroup.h>
10
11 #if (NXT_HAVE_MNTENT_H)
12 #include <mntent.h>
13 #endif
14
15
/*
 * Forward declarations of the helpers defined later in this file.  Each
 * group is declared only when the matching NXT_HAVE_* feature macro is set.
 */

/* Applies every supported member of the "isolation" configuration object. */
static nxt_int_t nxt_isolation_set(nxt_task_t *task,
    nxt_conf_value_t *isolation, nxt_process_t *process);

#if (NXT_HAVE_CGROUP)
static nxt_int_t nxt_isolation_set_cgroup(nxt_task_t *task,
    nxt_conf_value_t *isolation, nxt_process_t *process);
#endif

/* Namespace ("namespaces" member) handling. */
#if (NXT_HAVE_LINUX_NS)
static nxt_int_t nxt_isolation_set_namespaces(nxt_task_t *task,
    nxt_conf_value_t *isolation, nxt_process_t *process);
static nxt_int_t nxt_isolation_clone_flags(nxt_task_t *task,
    nxt_conf_value_t *namespaces, nxt_clone_t *clone);
#endif

/* "uidmap"/"gidmap" parsing and validation. */
#if (NXT_HAVE_CLONE_NEWUSER)
static nxt_int_t nxt_isolation_set_creds(nxt_task_t *task,
    nxt_conf_value_t *isolation, nxt_process_t *process);
static nxt_int_t nxt_isolation_credential_map(nxt_task_t *task,
    nxt_mp_t *mem_pool, nxt_conf_value_t *map_array,
    nxt_clone_credential_map_t *map);
static nxt_int_t nxt_isolation_vldt_creds(nxt_task_t *task,
    nxt_process_t *process);
#endif

/* "rootfs" and "automount" handling, mount list management, root change. */
#if (NXT_HAVE_ISOLATION_ROOTFS)
static nxt_int_t nxt_isolation_set_rootfs(nxt_task_t *task,
    nxt_conf_value_t *isolation, nxt_process_t *process);
static nxt_int_t nxt_isolation_set_automount(nxt_task_t *task,
    nxt_conf_value_t *isolation, nxt_process_t *process);
static nxt_int_t nxt_isolation_set_mounts(nxt_task_t *task,
    nxt_process_t *process, nxt_str_t *app_type);
static nxt_int_t nxt_isolation_set_lang_mounts(nxt_task_t *task,
    nxt_process_t *process, nxt_array_t *syspaths);
static int nxt_cdecl nxt_isolation_mount_compare(const void *v1,
    const void *v2);
static void nxt_isolation_unmount_all(nxt_task_t *task, nxt_process_t *process);

#if (NXT_HAVE_LINUX_PIVOT_ROOT) && (NXT_HAVE_CLONE_NEWNS)
static nxt_int_t nxt_isolation_pivot_root(nxt_task_t *task, const char *rootfs);
static nxt_int_t nxt_isolation_make_private_mount(nxt_task_t *task,
    const char *rootfs);
nxt_inline int nxt_pivot_root(const char *new_root, const char *old_root);
#endif

static nxt_int_t nxt_isolation_chroot(nxt_task_t *task, const char *path);
#endif

/* "new_privs" member handling. */
#if (NXT_HAVE_PR_SET_NO_NEW_PRIVS)
static nxt_int_t nxt_isolation_set_new_privs(nxt_task_t *task,
    nxt_conf_value_t *isolation, nxt_process_t *process);
#endif
68
69
70 nxt_int_t
nxt_isolation_main_prefork(nxt_task_t * task,nxt_process_t * process,nxt_mp_t * mp)71 nxt_isolation_main_prefork(nxt_task_t *task, nxt_process_t *process,
72 nxt_mp_t *mp)
73 {
74 nxt_int_t cap_setid;
75 nxt_int_t ret;
76 nxt_runtime_t *rt;
77 nxt_common_app_conf_t *app_conf;
78
79 rt = task->thread->runtime;
80 app_conf = process->data.app;
81 cap_setid = rt->capabilities.setid;
82
83 #if (NXT_HAVE_PR_SET_NO_NEW_PRIVS)
84 process->isolation.new_privs = 1;
85 #endif
86
87 if (app_conf->isolation != NULL) {
88 ret = nxt_isolation_set(task, app_conf->isolation, process);
89 if (nxt_slow_path(ret != NXT_OK)) {
90 return ret;
91 }
92 }
93
94 #if (NXT_HAVE_CLONE_NEWUSER)
95 if (nxt_is_clone_flag_set(process->isolation.clone.flags, NEWUSER)) {
96 cap_setid = 1;
97 }
98 #endif
99
100 if (cap_setid) {
101 ret = nxt_process_creds_set(task, process, &app_conf->user,
102 &app_conf->group);
103
104 if (nxt_slow_path(ret != NXT_OK)) {
105 return ret;
106 }
107
108 } else {
109 if (!nxt_str_eq(&app_conf->user, (u_char *) rt->user_cred.user,
110 nxt_strlen(rt->user_cred.user)))
111 {
112 nxt_alert(task, "cannot set user \"%V\" for app \"%V\": "
113 "missing capabilities", &app_conf->user, &app_conf->name);
114
115 return NXT_ERROR;
116 }
117
118 if (app_conf->group.length > 0
119 && !nxt_str_eq(&app_conf->group, (u_char *) rt->group,
120 nxt_strlen(rt->group)))
121 {
122 nxt_alert(task, "cannot set group \"%V\" for app \"%V\": "
123 "missing capabilities", &app_conf->group,
124 &app_conf->name);
125
126 return NXT_ERROR;
127 }
128 }
129
130 #if (NXT_HAVE_ISOLATION_ROOTFS)
131 if (process->isolation.rootfs != NULL) {
132 nxt_int_t has_mnt;
133
134 ret = nxt_isolation_set_mounts(task, process, &app_conf->type);
135 if (nxt_slow_path(ret != NXT_OK)) {
136 return ret;
137 }
138
139 #if (NXT_HAVE_CLONE_NEWNS)
140 has_mnt = nxt_is_clone_flag_set(process->isolation.clone.flags, NEWNS);
141 #else
142 has_mnt = 0;
143 #endif
144
145 if (process->user_cred->uid == 0 && !has_mnt) {
146 nxt_log(task, NXT_LOG_WARN,
147 "setting user \"root\" with \"rootfs\" is unsafe without "
148 "\"mount\" namespace isolation");
149 }
150 }
151 #endif
152
153 #if (NXT_HAVE_CLONE_NEWUSER)
154 ret = nxt_isolation_vldt_creds(task, process);
155 if (nxt_slow_path(ret != NXT_OK)) {
156 return ret;
157 }
158 #endif
159
160 return NXT_OK;
161 }
162
163
164 static nxt_int_t
nxt_isolation_set(nxt_task_t * task,nxt_conf_value_t * isolation,nxt_process_t * process)165 nxt_isolation_set(nxt_task_t *task, nxt_conf_value_t *isolation,
166 nxt_process_t *process)
167 {
168 #if (NXT_HAVE_CGROUP)
169 if (nxt_slow_path(nxt_isolation_set_cgroup(task, isolation, process)
170 != NXT_OK))
171 {
172 return NXT_ERROR;
173 }
174 #endif
175
176 #if (NXT_HAVE_LINUX_NS)
177 if (nxt_slow_path(nxt_isolation_set_namespaces(task, isolation, process)
178 != NXT_OK))
179 {
180 return NXT_ERROR;
181 }
182 #endif
183
184 #if (NXT_HAVE_CLONE_NEWUSER)
185 if (nxt_slow_path(nxt_isolation_set_creds(task, isolation, process)
186 != NXT_OK))
187 {
188 return NXT_ERROR;
189 }
190 #endif
191
192 #if (NXT_HAVE_ISOLATION_ROOTFS)
193 if (nxt_slow_path(nxt_isolation_set_rootfs(task, isolation, process)
194 != NXT_OK))
195 {
196 return NXT_ERROR;
197 }
198
199 if (nxt_slow_path(nxt_isolation_set_automount(task, isolation, process)
200 != NXT_OK))
201 {
202 return NXT_ERROR;
203 }
204 #endif
205
206 #if (NXT_HAVE_PR_SET_NO_NEW_PRIVS)
207 if (nxt_slow_path(nxt_isolation_set_new_privs(task, isolation, process)
208 != NXT_OK))
209 {
210 return NXT_ERROR;
211 }
212 #endif
213
214 return NXT_OK;
215 }
216
217
218 #if (NXT_HAVE_CGROUP)
219
220 static nxt_int_t
nxt_isolation_set_cgroup(nxt_task_t * task,nxt_conf_value_t * isolation,nxt_process_t * process)221 nxt_isolation_set_cgroup(nxt_task_t *task, nxt_conf_value_t *isolation,
222 nxt_process_t *process)
223 {
224 nxt_str_t str;
225 nxt_conf_value_t *obj;
226
227 static nxt_str_t cgname = nxt_string("cgroup");
228 static nxt_str_t path = nxt_string("path");
229
230 obj = nxt_conf_get_object_member(isolation, &cgname, NULL);
231 if (obj == NULL) {
232 return NXT_OK;
233 }
234
235 obj = nxt_conf_get_object_member(obj, &path, NULL);
236 if (obj == NULL) {
237 return NXT_ERROR;
238 }
239
240 nxt_conf_get_string(obj, &str);
241 process->isolation.cgroup.path = nxt_mp_alloc(process->mem_pool,
242 str.length + 1);
243 nxt_memcpy(process->isolation.cgroup.path, str.start, str.length);
244 process->isolation.cgroup.path[str.length] = '\0';
245
246 process->isolation.cgroup_cleanup = nxt_cgroup_cleanup;
247
248 return NXT_OK;
249 }
250
251 #endif
252
253
254 #if (NXT_HAVE_LINUX_NS)
255
256 static nxt_int_t
nxt_isolation_set_namespaces(nxt_task_t * task,nxt_conf_value_t * isolation,nxt_process_t * process)257 nxt_isolation_set_namespaces(nxt_task_t *task, nxt_conf_value_t *isolation,
258 nxt_process_t *process)
259 {
260 nxt_int_t ret;
261 nxt_conf_value_t *obj;
262
263 static nxt_str_t nsname = nxt_string("namespaces");
264
265 obj = nxt_conf_get_object_member(isolation, &nsname, NULL);
266 if (obj != NULL) {
267 ret = nxt_isolation_clone_flags(task, obj, &process->isolation.clone);
268 if (nxt_slow_path(ret != NXT_OK)) {
269 return NXT_ERROR;
270 }
271 }
272
273 return NXT_OK;
274 }
275
276 #endif
277
278
279 #if (NXT_HAVE_CLONE_NEWUSER)
280
281 static nxt_int_t
nxt_isolation_set_creds(nxt_task_t * task,nxt_conf_value_t * isolation,nxt_process_t * process)282 nxt_isolation_set_creds(nxt_task_t *task, nxt_conf_value_t *isolation,
283 nxt_process_t *process)
284 {
285 nxt_int_t ret;
286 nxt_clone_t *clone;
287 nxt_conf_value_t *array;
288
289 static nxt_str_t uidname = nxt_string("uidmap");
290 static nxt_str_t gidname = nxt_string("gidmap");
291
292 clone = &process->isolation.clone;
293
294 array = nxt_conf_get_object_member(isolation, &uidname, NULL);
295 if (array != NULL) {
296 ret = nxt_isolation_credential_map(task, process->mem_pool, array,
297 &clone->uidmap);
298
299 if (nxt_slow_path(ret != NXT_OK)) {
300 return NXT_ERROR;
301 }
302 }
303
304 array = nxt_conf_get_object_member(isolation, &gidname, NULL);
305 if (array != NULL) {
306 ret = nxt_isolation_credential_map(task, process->mem_pool, array,
307 &clone->gidmap);
308
309 if (nxt_slow_path(ret != NXT_OK)) {
310 return NXT_ERROR;
311 }
312 }
313
314 return NXT_OK;
315 }
316
317
318 static nxt_int_t
nxt_isolation_credential_map(nxt_task_t * task,nxt_mp_t * mp,nxt_conf_value_t * map_array,nxt_clone_credential_map_t * map)319 nxt_isolation_credential_map(nxt_task_t *task, nxt_mp_t *mp,
320 nxt_conf_value_t *map_array, nxt_clone_credential_map_t *map)
321 {
322 nxt_int_t ret;
323 nxt_uint_t i;
324 nxt_conf_value_t *obj;
325
326 static nxt_conf_map_t nxt_clone_map_entry_conf[] = {
327 {
328 nxt_string("container"),
329 NXT_CONF_MAP_INT,
330 offsetof(nxt_clone_map_entry_t, container),
331 },
332
333 {
334 nxt_string("host"),
335 NXT_CONF_MAP_INT,
336 offsetof(nxt_clone_map_entry_t, host),
337 },
338
339 {
340 nxt_string("size"),
341 NXT_CONF_MAP_INT,
342 offsetof(nxt_clone_map_entry_t, size),
343 },
344 };
345
346 map->size = nxt_conf_array_elements_count(map_array);
347
348 if (map->size == 0) {
349 return NXT_OK;
350 }
351
352 map->map = nxt_mp_alloc(mp, map->size * sizeof(nxt_clone_map_entry_t));
353 if (nxt_slow_path(map->map == NULL)) {
354 return NXT_ERROR;
355 }
356
357 for (i = 0; i < map->size; i++) {
358 obj = nxt_conf_get_array_element(map_array, i);
359
360 ret = nxt_conf_map_object(mp, obj, nxt_clone_map_entry_conf,
361 nxt_nitems(nxt_clone_map_entry_conf),
362 map->map + i);
363 if (nxt_slow_path(ret != NXT_OK)) {
364 nxt_alert(task, "clone map entry map error");
365 return NXT_ERROR;
366 }
367 }
368
369 return NXT_OK;
370 }
371
372
373 static nxt_int_t
nxt_isolation_vldt_creds(nxt_task_t * task,nxt_process_t * process)374 nxt_isolation_vldt_creds(nxt_task_t *task, nxt_process_t *process)
375 {
376 nxt_int_t ret;
377 nxt_clone_t *clone;
378 nxt_credential_t *creds;
379
380 clone = &process->isolation.clone;
381 creds = process->user_cred;
382
383 if (clone->uidmap.size == 0 && clone->gidmap.size == 0) {
384 return NXT_OK;
385 }
386
387 if (!nxt_is_clone_flag_set(clone->flags, NEWUSER)) {
388 if (nxt_slow_path(clone->uidmap.size > 0)) {
389 nxt_log(task, NXT_LOG_ERR, "\"uidmap\" is set but "
390 "\"isolation.namespaces.credential\" is false or unset");
391
392 return NXT_ERROR;
393 }
394
395 if (nxt_slow_path(clone->gidmap.size > 0)) {
396 nxt_log(task, NXT_LOG_ERR, "\"gidmap\" is set but "
397 "\"isolation.namespaces.credential\" is false or unset");
398
399 return NXT_ERROR;
400 }
401
402 return NXT_OK;
403 }
404
405 ret = nxt_clone_vldt_credential_uidmap(task, &clone->uidmap, creds);
406 if (nxt_slow_path(ret != NXT_OK)) {
407 return NXT_ERROR;
408 }
409
410 return nxt_clone_vldt_credential_gidmap(task, &clone->gidmap, creds);
411 }
412
413 #endif
414
415
416 #if (NXT_HAVE_LINUX_NS)
417
418 static nxt_int_t
nxt_isolation_clone_flags(nxt_task_t * task,nxt_conf_value_t * namespaces,nxt_clone_t * clone)419 nxt_isolation_clone_flags(nxt_task_t *task, nxt_conf_value_t *namespaces,
420 nxt_clone_t *clone)
421 {
422 uint32_t index;
423 nxt_str_t name;
424 nxt_int_t flag;
425 nxt_conf_value_t *value;
426
427 index = 0;
428
429 for ( ;; ) {
430 value = nxt_conf_next_object_member(namespaces, &name, &index);
431
432 if (value == NULL) {
433 break;
434 }
435
436 flag = 0;
437
438 #if (NXT_HAVE_CLONE_NEWUSER)
439 if (nxt_str_eq(&name, "credential", 10)) {
440 flag = CLONE_NEWUSER;
441 }
442 #endif
443
444 #if (NXT_HAVE_CLONE_NEWPID)
445 if (nxt_str_eq(&name, "pid", 3)) {
446 flag = CLONE_NEWPID;
447 }
448 #endif
449
450 #if (NXT_HAVE_CLONE_NEWNET)
451 if (nxt_str_eq(&name, "network", 7)) {
452 flag = CLONE_NEWNET;
453 }
454 #endif
455
456 #if (NXT_HAVE_CLONE_NEWUTS)
457 if (nxt_str_eq(&name, "uname", 5)) {
458 flag = CLONE_NEWUTS;
459 }
460 #endif
461
462 #if (NXT_HAVE_CLONE_NEWNS)
463 if (nxt_str_eq(&name, "mount", 5)) {
464 flag = CLONE_NEWNS;
465 }
466 #endif
467
468 #if (NXT_HAVE_CLONE_NEWCGROUP)
469 if (nxt_str_eq(&name, "cgroup", 6)) {
470 flag = CLONE_NEWCGROUP;
471 }
472 #endif
473
474 if (!flag) {
475 nxt_alert(task, "unknown namespace flag: \"%V\"", &name);
476 return NXT_ERROR;
477 }
478
479 if (nxt_conf_get_boolean(value)) {
480 clone->flags |= flag;
481 }
482 }
483
484 return NXT_OK;
485 }
486
487 #endif
488
489
490 #if (NXT_HAVE_ISOLATION_ROOTFS)
491
492 static nxt_int_t
nxt_isolation_set_rootfs(nxt_task_t * task,nxt_conf_value_t * isolation,nxt_process_t * process)493 nxt_isolation_set_rootfs(nxt_task_t *task, nxt_conf_value_t *isolation,
494 nxt_process_t *process)
495 {
496 nxt_str_t str;
497 nxt_conf_value_t *obj;
498
499 static nxt_str_t rootfs_name = nxt_string("rootfs");
500
501 obj = nxt_conf_get_object_member(isolation, &rootfs_name, NULL);
502 if (obj != NULL) {
503 nxt_conf_get_string(obj, &str);
504
505 if (nxt_slow_path(str.length <= 1 || str.start[0] != '/')) {
506 nxt_log(task, NXT_LOG_ERR, "rootfs requires an absolute path other "
507 "than \"/\" but given \"%V\"", &str);
508
509 return NXT_ERROR;
510 }
511
512 if (str.start[str.length - 1] == '/') {
513 str.length--;
514 }
515
516 process->isolation.rootfs = nxt_mp_alloc(process->mem_pool,
517 str.length + 1);
518
519 if (nxt_slow_path(process->isolation.rootfs == NULL)) {
520 return NXT_ERROR;
521 }
522
523 nxt_memcpy(process->isolation.rootfs, str.start, str.length);
524
525 process->isolation.rootfs[str.length] = '\0';
526 }
527
528 return NXT_OK;
529 }
530
531
532 static nxt_int_t
nxt_isolation_set_automount(nxt_task_t * task,nxt_conf_value_t * isolation,nxt_process_t * process)533 nxt_isolation_set_automount(nxt_task_t *task, nxt_conf_value_t *isolation,
534 nxt_process_t *process)
535 {
536 nxt_conf_value_t *conf, *value;
537 nxt_process_automount_t *automount;
538
539 static nxt_str_t automount_name = nxt_string("automount");
540 static nxt_str_t langdeps_name = nxt_string("language_deps");
541 static nxt_str_t tmp_name = nxt_string("tmpfs");
542 static nxt_str_t proc_name = nxt_string("procfs");
543
544 automount = &process->isolation.automount;
545
546 automount->language_deps = 1;
547 automount->tmpfs = 1;
548 automount->procfs = 1;
549
550 conf = nxt_conf_get_object_member(isolation, &automount_name, NULL);
551 if (conf != NULL) {
552 value = nxt_conf_get_object_member(conf, &langdeps_name, NULL);
553 if (value != NULL) {
554 automount->language_deps = nxt_conf_get_boolean(value);
555 }
556
557 value = nxt_conf_get_object_member(conf, &tmp_name, NULL);
558 if (value != NULL) {
559 automount->tmpfs = nxt_conf_get_boolean(value);
560 }
561
562 value = nxt_conf_get_object_member(conf, &proc_name, NULL);
563 if (value != NULL) {
564 automount->procfs = nxt_conf_get_boolean(value);
565 }
566 }
567
568 return NXT_OK;
569 }
570
571
572 static nxt_int_t
nxt_isolation_set_mounts(nxt_task_t * task,nxt_process_t * process,nxt_str_t * app_type)573 nxt_isolation_set_mounts(nxt_task_t *task, nxt_process_t *process,
574 nxt_str_t *app_type)
575 {
576 nxt_int_t ret, cap_chroot;
577 nxt_runtime_t *rt;
578 nxt_app_lang_module_t *lang;
579
580 rt = task->thread->runtime;
581 cap_chroot = rt->capabilities.chroot;
582 lang = nxt_app_lang_module(rt, app_type);
583
584 nxt_assert(lang != NULL);
585
586 #if (NXT_HAVE_CLONE_NEWUSER)
587 if (nxt_is_clone_flag_set(process->isolation.clone.flags, NEWUSER)) {
588 cap_chroot = 1;
589 }
590 #endif
591
592 if (!cap_chroot) {
593 nxt_log(task, NXT_LOG_ERR, "The \"rootfs\" field requires privileges");
594 return NXT_ERROR;
595 }
596
597 ret = nxt_isolation_set_lang_mounts(task, process, lang->mounts);
598 if (nxt_slow_path(ret != NXT_OK)) {
599 return NXT_ERROR;
600 }
601
602 process->isolation.cleanup = nxt_isolation_unmount_all;
603
604 return NXT_OK;
605 }
606
607
608 static nxt_int_t
nxt_isolation_set_lang_mounts(nxt_task_t * task,nxt_process_t * process,nxt_array_t * lang_mounts)609 nxt_isolation_set_lang_mounts(nxt_task_t *task, nxt_process_t *process,
610 nxt_array_t *lang_mounts)
611 {
612 u_char *p;
613 size_t i, n, rootfs_len, len;
614 nxt_mp_t *mp;
615 nxt_array_t *mounts;
616 const u_char *rootfs;
617 nxt_fs_mount_t *mnt, *lang_mnt;
618
619 mp = process->mem_pool;
620
621 /* copy to init mem pool */
622 mounts = nxt_array_copy(mp, NULL, lang_mounts);
623 if (mounts == NULL) {
624 return NXT_ERROR;
625 }
626
627 n = mounts->nelts;
628 mnt = mounts->elts;
629 lang_mnt = lang_mounts->elts;
630
631 rootfs = process->isolation.rootfs;
632 rootfs_len = nxt_strlen(rootfs);
633
634 for (i = 0; i < n; i++) {
635 len = nxt_strlen(lang_mnt[i].dst);
636
637 mnt[i].dst = nxt_mp_alloc(mp, rootfs_len + len + 1);
638 if (nxt_slow_path(mnt[i].dst == NULL)) {
639 return NXT_ERROR;
640 }
641
642 p = nxt_cpymem(mnt[i].dst, rootfs, rootfs_len);
643 p = nxt_cpymem(p, lang_mnt[i].dst, len);
644 *p = '\0';
645 }
646
647 if (process->isolation.automount.tmpfs) {
648 mnt = nxt_array_add(mounts);
649 if (nxt_slow_path(mnt == NULL)) {
650 return NXT_ERROR;
651 }
652
653 mnt->src = (u_char *) "tmpfs";
654 mnt->name = (u_char *) "tmpfs";
655 mnt->type = NXT_FS_TMP;
656 mnt->flags = (NXT_FS_FLAGS_NOSUID
657 | NXT_FS_FLAGS_NODEV
658 | NXT_FS_FLAGS_NOEXEC);
659 mnt->data = (u_char *) "size=1m,mode=1777";
660 mnt->builtin = 1;
661 mnt->deps = 0;
662
663 mnt->dst = nxt_mp_nget(mp, rootfs_len + nxt_length("/tmp") + 1);
664 if (nxt_slow_path(mnt->dst == NULL)) {
665 return NXT_ERROR;
666 }
667
668 p = nxt_cpymem(mnt->dst, rootfs, rootfs_len);
669 p = nxt_cpymem(p, "/tmp", 4);
670 *p = '\0';
671 }
672
673 if (process->isolation.automount.procfs) {
674 mnt = nxt_array_add(mounts);
675 if (nxt_slow_path(mnt == NULL)) {
676 return NXT_ERROR;
677 }
678
679 mnt->name = (u_char *) "proc";
680 mnt->type = NXT_FS_PROC;
681 mnt->src = (u_char *) "none";
682 mnt->dst = nxt_mp_nget(mp, rootfs_len + nxt_length("/proc") + 1);
683 if (nxt_slow_path(mnt->dst == NULL)) {
684 return NXT_ERROR;
685 }
686
687 p = nxt_cpymem(mnt->dst, rootfs, rootfs_len);
688 p = nxt_cpymem(p, "/proc", 5);
689 *p = '\0';
690
691 mnt->data = (u_char *) "";
692 mnt->flags = NXT_FS_FLAGS_NOEXEC | NXT_FS_FLAGS_NOSUID;
693 mnt->builtin = 1;
694 mnt->deps = 0;
695 }
696
697 qsort(mounts->elts, mounts->nelts, sizeof(nxt_fs_mount_t),
698 nxt_isolation_mount_compare);
699
700 process->isolation.mounts = mounts;
701
702 return NXT_OK;
703 }
704
705
706 static int nxt_cdecl
nxt_isolation_mount_compare(const void * v1,const void * v2)707 nxt_isolation_mount_compare(const void *v1, const void *v2)
708 {
709 const nxt_fs_mount_t *mnt1, *mnt2;
710
711 mnt1 = v1;
712 mnt2 = v2;
713
714 return nxt_strlen(mnt1->src) > nxt_strlen(mnt2->src);
715 }
716
717
718 void
nxt_isolation_unmount_all(nxt_task_t * task,nxt_process_t * process)719 nxt_isolation_unmount_all(nxt_task_t *task, nxt_process_t *process)
720 {
721 size_t n;
722 nxt_array_t *mounts;
723 nxt_runtime_t *rt;
724 nxt_fs_mount_t *mnt;
725 nxt_process_automount_t *automount;
726
727 rt = task->thread->runtime;
728
729 if (!rt->capabilities.setid) {
730 return;
731 }
732
733 nxt_debug(task, "unmount all (%s)", process->name);
734
735 automount = &process->isolation.automount;
736 mounts = process->isolation.mounts;
737 n = mounts->nelts;
738 mnt = mounts->elts;
739
740 while (n > 0) {
741 n--;
742
743 if (mnt[n].deps && !automount->language_deps) {
744 continue;
745 }
746
747 nxt_fs_unmount(mnt[n].dst);
748 }
749 }
750
751
752 nxt_int_t
nxt_isolation_prepare_rootfs(nxt_task_t * task,nxt_process_t * process)753 nxt_isolation_prepare_rootfs(nxt_task_t *task, nxt_process_t *process)
754 {
755 size_t i, n;
756 nxt_int_t ret;
757 struct stat st;
758 nxt_array_t *mounts;
759 const u_char *dst;
760 nxt_fs_mount_t *mnt;
761 nxt_process_automount_t *automount;
762
763 automount = &process->isolation.automount;
764 mounts = process->isolation.mounts;
765
766 n = mounts->nelts;
767 mnt = mounts->elts;
768
769 for (i = 0; i < n; i++) {
770 dst = mnt[i].dst;
771
772 if (mnt[i].deps && !automount->language_deps) {
773 continue;
774 }
775
776 if (nxt_slow_path(mnt[i].type == NXT_FS_BIND
777 && stat((const char *) mnt[i].src, &st) != 0))
778 {
779 nxt_log(task, NXT_LOG_WARN, "host path not found: %s", mnt[i].src);
780 continue;
781 }
782
783 ret = nxt_fs_mkdir_all(dst, S_IRWXU | S_IRWXG | S_IRWXO);
784 if (nxt_slow_path(ret != NXT_OK)) {
785 nxt_alert(task, "mkdir(%s) %E", dst, nxt_errno);
786 goto undo;
787 }
788
789 ret = nxt_fs_mount(task, &mnt[i]);
790 if (nxt_slow_path(ret != NXT_OK)) {
791 goto undo;
792 }
793 }
794
795 return NXT_OK;
796
797 undo:
798
799 n = i + 1;
800
801 for (i = 0; i < n; i++) {
802 nxt_fs_unmount(mnt[i].dst);
803 }
804
805 return NXT_ERROR;
806 }
807
808
809 #if (NXT_HAVE_LINUX_PIVOT_ROOT) && (NXT_HAVE_CLONE_NEWNS)
810
811 nxt_int_t
nxt_isolation_change_root(nxt_task_t * task,nxt_process_t * process)812 nxt_isolation_change_root(nxt_task_t *task, nxt_process_t *process)
813 {
814 char *rootfs;
815 nxt_int_t ret;
816
817 rootfs = (char *) process->isolation.rootfs;
818
819 nxt_debug(task, "change root: %s", rootfs);
820
821 if (nxt_is_clone_flag_set(process->isolation.clone.flags, NEWNS)) {
822 ret = nxt_isolation_pivot_root(task, rootfs);
823
824 } else {
825 ret = nxt_isolation_chroot(task, rootfs);
826 }
827
828 if (nxt_fast_path(ret == NXT_OK)) {
829 if (nxt_slow_path(chdir("/") < 0)) {
830 nxt_alert(task, "chdir(\"/\") %E", nxt_errno);
831 return NXT_ERROR;
832 }
833 }
834
835 return ret;
836 }
837
838
839 /*
840 * pivot_root(2) can only be safely used with containers, otherwise it can
841 * umount(2) the global root filesystem and screw up the machine.
842 */
843
844 static nxt_int_t
nxt_isolation_pivot_root(nxt_task_t * task,const char * path)845 nxt_isolation_pivot_root(nxt_task_t *task, const char *path)
846 {
847 /*
848 * This implementation makes use of a kernel trick that works for ages
849 * and now documented in Linux kernel 5.
850 * https://lore.kernel.org/linux-man/87r24piwhm.fsf@x220.int.ebiederm.org/T/
851 */
852
853 if (nxt_slow_path(mount("", "/", "", MS_SLAVE|MS_REC, "") != 0)) {
854 nxt_alert(task, "mount(\"/\", MS_SLAVE|MS_REC) failed: %E", nxt_errno);
855 return NXT_ERROR;
856 }
857
858 if (nxt_slow_path(nxt_isolation_make_private_mount(task, path) != NXT_OK)) {
859 return NXT_ERROR;
860 }
861
862 if (nxt_slow_path(mount(path, path, "bind", MS_BIND|MS_REC, "") != 0)) {
863 nxt_alert(task, "error bind mounting rootfs %E", nxt_errno);
864 return NXT_ERROR;
865 }
866
867 if (nxt_slow_path(chdir(path) != 0)) {
868 nxt_alert(task, "failed to chdir(%s) %E", path, nxt_errno);
869 return NXT_ERROR;
870 }
871
872 if (nxt_slow_path(nxt_pivot_root(".", ".") != 0)) {
873 nxt_alert(task, "failed to pivot_root %E", nxt_errno);
874 return NXT_ERROR;
875 }
876
877 /*
878 * Demote the oldroot mount to avoid unmounts getting propagated to
879 * the host.
880 */
881 if (nxt_slow_path(mount("", ".", "", MS_SLAVE | MS_REC, NULL) != 0)) {
882 nxt_alert(task, "failed to bind mount rootfs %E", nxt_errno);
883 return NXT_ERROR;
884 }
885
886 if (nxt_slow_path(umount2(".", MNT_DETACH) != 0)) {
887 nxt_alert(task, "failed to umount old root directory %E", nxt_errno);
888 return NXT_ERROR;
889 }
890
891 return NXT_OK;
892 }
893
894
895 static nxt_int_t
nxt_isolation_make_private_mount(nxt_task_t * task,const char * rootfs)896 nxt_isolation_make_private_mount(nxt_task_t *task, const char *rootfs)
897 {
898 char *parent_mnt;
899 FILE *procfile;
900 u_char **mounts;
901 size_t len;
902 uint8_t *shared;
903 nxt_int_t ret, index, nmounts;
904 struct mntent *ent;
905
906 static const char *mount_path = "/proc/self/mounts";
907
908 ret = NXT_ERROR;
909 ent = NULL;
910 shared = NULL;
911 procfile = NULL;
912 parent_mnt = NULL;
913
914 nmounts = 256;
915
916 mounts = nxt_malloc(nmounts * sizeof(uintptr_t));
917 if (nxt_slow_path(mounts == NULL)) {
918 goto fail;
919 }
920
921 shared = nxt_malloc(nmounts);
922 if (nxt_slow_path(shared == NULL)) {
923 goto fail;
924 }
925
926 procfile = setmntent(mount_path, "r");
927 if (nxt_slow_path(procfile == NULL)) {
928 nxt_alert(task, "failed to open %s %E", mount_path, nxt_errno);
929
930 goto fail;
931 }
932
933 index = 0;
934
935 again:
936
937 for ( ; index < nmounts; index++) {
938 ent = getmntent(procfile);
939 if (ent == NULL) {
940 nmounts = index;
941 break;
942 }
943
944 mounts[index] = (u_char *) strdup(ent->mnt_dir);
945 shared[index] = hasmntopt(ent, "shared") != NULL;
946 }
947
948 if (ent != NULL) {
949 /* there are still entries to be read */
950
951 nmounts *= 2;
952 mounts = nxt_realloc(mounts, nmounts);
953 if (nxt_slow_path(mounts == NULL)) {
954 goto fail;
955 }
956
957 shared = nxt_realloc(shared, nmounts);
958 if (nxt_slow_path(shared == NULL)) {
959 goto fail;
960 }
961
962 goto again;
963 }
964
965 for (index = 0; index < nmounts; index++) {
966 if (nxt_strcmp(mounts[index], rootfs) == 0) {
967 parent_mnt = (char *) rootfs;
968 break;
969 }
970 }
971
972 if (parent_mnt == NULL) {
973 len = nxt_strlen(rootfs);
974
975 parent_mnt = nxt_malloc(len + 1);
976 if (parent_mnt == NULL) {
977 goto fail;
978 }
979
980 nxt_memcpy(parent_mnt, rootfs, len);
981 parent_mnt[len] = '\0';
982
983 if (parent_mnt[len - 1] == '/') {
984 parent_mnt[len - 1] = '\0';
985 len--;
986 }
987
988 for ( ;; ) {
989 for (index = 0; index < nmounts; index++) {
990 if (nxt_strcmp(mounts[index], parent_mnt) == 0) {
991 goto found;
992 }
993 }
994
995 if (len == 1 && parent_mnt[0] == '/') {
996 nxt_alert(task, "parent mount not found");
997 goto fail;
998 }
999
1000 /* parent dir */
1001 while (parent_mnt[len - 1] != '/' && len > 0) {
1002 len--;
1003 }
1004
1005 if (nxt_slow_path(len == 0)) {
1006 nxt_alert(task, "parent mount not found");
1007 goto fail;
1008 }
1009
1010 if (len == 1) {
1011 parent_mnt[len] = '\0'; /* / */
1012 } else {
1013 parent_mnt[len - 1] = '\0'; /* /<path> */
1014 }
1015 }
1016 }
1017
1018 found:
1019
1020 if (shared[index]) {
1021 if (nxt_slow_path(mount("", parent_mnt, "", MS_PRIVATE, "") != 0)) {
1022 nxt_alert(task, "mount(\"\", \"%s\", MS_PRIVATE) %E", parent_mnt,
1023 nxt_errno);
1024
1025 goto fail;
1026 }
1027 }
1028
1029 ret = NXT_OK;
1030
1031 fail:
1032
1033 if (procfile != NULL) {
1034 endmntent(procfile);
1035 }
1036
1037 if (mounts != NULL) {
1038 for (index = 0; index < nmounts; index++) {
1039 nxt_free(mounts[index]);
1040 }
1041
1042 nxt_free(mounts);
1043 }
1044
1045 if (shared != NULL) {
1046 nxt_free(shared);
1047 }
1048
1049 if (parent_mnt != NULL && parent_mnt != rootfs) {
1050 nxt_free(parent_mnt);
1051 }
1052
1053 return ret;
1054 }
1055
1056
1057 nxt_inline int
nxt_pivot_root(const char * new_root,const char * old_root)1058 nxt_pivot_root(const char *new_root, const char *old_root)
1059 {
1060 return syscall(SYS_pivot_root, new_root, old_root);
1061 }
1062
1063
1064 #else /* !(NXT_HAVE_LINUX_PIVOT_ROOT) || !(NXT_HAVE_CLONE_NEWNS) */
1065
1066
1067 nxt_int_t
nxt_isolation_change_root(nxt_task_t * task,nxt_process_t * process)1068 nxt_isolation_change_root(nxt_task_t *task, nxt_process_t *process)
1069 {
1070 char *rootfs;
1071
1072 rootfs = (char *) process->isolation.rootfs;
1073
1074 nxt_debug(task, "change root: %s", rootfs);
1075
1076 if (nxt_fast_path(nxt_isolation_chroot(task, rootfs) == NXT_OK)) {
1077 if (nxt_slow_path(chdir("/") < 0)) {
1078 nxt_alert(task, "chdir(\"/\") %E", nxt_errno);
1079 return NXT_ERROR;
1080 }
1081
1082 return NXT_OK;
1083 }
1084
1085 return NXT_ERROR;
1086 }
1087
1088 #endif
1089
1090
1091 static nxt_int_t
nxt_isolation_chroot(nxt_task_t * task,const char * path)1092 nxt_isolation_chroot(nxt_task_t *task, const char *path)
1093 {
1094 if (nxt_slow_path(chroot(path) < 0)) {
1095 nxt_alert(task, "chroot(%s) %E", path, nxt_errno);
1096 return NXT_ERROR;
1097 }
1098
1099 return NXT_OK;
1100 }
1101
1102 #endif /* NXT_HAVE_ISOLATION_ROOTFS */
1103
1104
1105 #if (NXT_HAVE_PR_SET_NO_NEW_PRIVS)
1106
1107 static nxt_int_t
nxt_isolation_set_new_privs(nxt_task_t * task,nxt_conf_value_t * isolation,nxt_process_t * process)1108 nxt_isolation_set_new_privs(nxt_task_t *task, nxt_conf_value_t *isolation,
1109 nxt_process_t *process)
1110 {
1111 nxt_conf_value_t *obj;
1112
1113 static nxt_str_t new_privs_name = nxt_string("new_privs");
1114
1115 obj = nxt_conf_get_object_member(isolation, &new_privs_name, NULL);
1116 if (obj != NULL) {
1117 process->isolation.new_privs = nxt_conf_get_boolean(obj);
1118 }
1119
1120 return NXT_OK;
1121 }
1122
1123 #endif
1124