Thursday, March 22, 2012

Patching the Linux kernel to add the ability to send an event on do_fork()/do_exit()

Recently I faced an interesting question: how to get an event when processes from a particular group call do_fork()/do_exit(), not for any process on the system, but only for some of them. It is very useful, for example, when you want to keep a group of processes persistent (an IDS/IPS, say), but sometimes the software "falls over" =) and the system must restart it. We could write a cron script that checks the PIDs, or we could create N pthreads, exec() a new process from each thread and then wait4() on it (re-exec it when it falls). The first solution means constant polling (every t seconds we must check, even if nothing has happened); the second one does not scale so well, because we need 2N + 1 working processes for N persistent processes. Of course, it does not really load the system: each wait4() thread sleeps and doesn't eat much CPU time. So I wrote a small kernel patch based on the cgroups subsystem. In short, we put the persistent processes into a special cgroup and register a userspace event handler. When a process moves out of this cgroup (do_exit() moves it out too), we catch the event and restart the process.


Let's go. Start by looking at kernel/cgroup.c:
void cgroup_exit(struct task_struct *tsk, int run_callbacks)
{
 struct css_set *cg;
 int i;

 /*
  * Unlink from the css_set task list if necessary.
  * Optimistically check cg_list before taking
  * css_set_lock
  */
 if (!list_empty(&tsk->cg_list)) {
  write_lock(&css_set_lock);
  if (!list_empty(&tsk->cg_list))
   list_del_init(&tsk->cg_list);
  write_unlock(&css_set_lock);
 }

 /* Reassign the task to the init_css_set. */
 task_lock(tsk);
 cg = tsk->cgroups;
 tsk->cgroups = &init_css_set;

We are interested in tsk->cgroups, because it points to the current css_set for this task. The css_set structure has pointers into every mounted cgroup hierarchy.
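For reference, here is a trimmed-down view of the structures involved (roughly as they look in the 3.x sources of that time; most fields are omitted):

/* include/linux/cgroup.h and sched.h, simplified excerpt */
struct css_set {
    atomic_t refcount;                 /* how many tasks share this set */
    struct list_head tasks;            /* tasks using this css_set */
    struct list_head cg_links;         /* links to a cgroup in each hierarchy */
    struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
};

struct task_struct {
    /* ... */
    struct css_set *cgroups;           /* what cgroup_exit() resets to &init_css_set */
    struct list_head cg_list;          /* entry in css_set->tasks */
    /* ... */
};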

OK, the next step is to keep a list of event handlers for userspace consumers that want do_fork()/do_exit() events. You can see how this is done in mm/memcontrol.c, search for the threshold events. From memcontrol.c we can understand how events are registered:
 {
  .name = "oom_control",
  .read_map = mem_cgroup_oom_control_read,
  .write_u64 = mem_cgroup_oom_control_write,
  .register_event = mem_cgroup_oom_register_event,
  .unregister_event = mem_cgroup_oom_unregister_event,
  .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
 },

The register_event() callback is invoked when userspace registers an eventfd for the file (by writing to cgroup.event_control). A sample userspace tool can be found in Documentation/cgroups/cgroup_event_listener.c in the kernel sources.
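With the patch from this post applied to the tasks file, the userspace side boils down to a few system calls. Here is a minimal sketch of what cgroup_event_listener.c does; the mount point /cgroups/persistent is an assumption for the example, and error handling is omitted:

/* Minimal sketch: register an eventfd for the tasks file and wait for
 * fork/exit events. The cgroup path is an assumption for the example. */
#include <sys/eventfd.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    char line[64];
    uint64_t counter;
    int efd = eventfd(0, 0);
    int tfd = open("/cgroups/persistent/tasks", O_RDONLY);
    int cfd = open("/cgroups/persistent/cgroup.event_control", O_WRONLY);

    /* "<eventfd> <fd of the file we want events from>" */
    snprintf(line, sizeof(line), "%d %d", efd, tfd);
    write(cfd, line, strlen(line));      /* ends up in the register_event() callback */

    /* block until the kernel does eventfd_signal() on our eventfd */
    read(efd, &counter, sizeof(counter));
    printf("fork/exit happened in the group, count=%llu\n",
           (unsigned long long)counter);
    return 0;
}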

Let's look at mem_cgroup_oom_register_event():
static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
{
 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 struct mem_cgroup_eventfd_list *event;
 int type = MEMFILE_TYPE(cft->private);

 BUG_ON(type != _OOM_TYPE);
 event = kmalloc(sizeof(*event), GFP_KERNEL);
 if (!event)
  return -ENOMEM;

 spin_lock(&memcg_oom_lock);

 event->eventfd = eventfd;
 list_add(&event->list, &memcg->oom_notify);
...
 spin_unlock(&memcg_oom_lock);

 

Not so hard: we just take the eventfd, allocate space for a list member and add it to the list. OK, we are ready to create our own event for do_fork()/do_exit(). But wait a second, where exactly should we patch? Not so hard to figure out: the tasks file in the cgroup directory, because all PID movements in a cgroup are reflected in the tasks file. OK, we are ready for the patch.

First we create a struct fe_eventfd_list for storing registered events and a spinlock to guard the list, and add both to the cgroup struct:
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -192,6 +192,12 @@ struct cgroup_pidlist {
  struct rw_semaphore mutex;
 };
 
+struct fe_eventfd_list {
+ struct list_head list;
+ struct eventfd_ctx *eventfd;
+};
+
 struct cgroup {
  unsigned long flags;  /* "unsigned long" so bitops work */
 
@@ -243,6 +249,10 @@ struct cgroup {
  /* List of events which userspace want to receive */
  struct list_head event_list;
  spinlock_t event_list_lock;
+
+ /* fork-exit event */
+ struct list_head fe_notify;
+ spinlock_t fe_list_lock;
 };

The next step is to initialize the list. Remember that the init process forks too, so the list must be initialized at system start time:
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1336,6 +1336,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
  INIT_LIST_HEAD(&cgrp->css_sets);
  INIT_LIST_HEAD(&cgrp->release_list);
  INIT_LIST_HEAD(&cgrp->pidlists);
+ INIT_LIST_HEAD(&cgrp->fe_notify);
+ spin_lock_init(&cgrp->fe_list_lock);
  mutex_init(&cgrp->pidlist_mutex);
  INIT_LIST_HEAD(&cgrp->event_list);
  spin_lock_init(&cgrp->event_list_lock);

Now we can add the register/unregister event callbacks for the "tasks" file:
+static int tasks_register_event(struct cgroup *cgrp,
+ struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
+{
+ struct fe_eventfd_list *ev;
+
+ ev = kmalloc(sizeof(*ev), GFP_KERNEL);
+ if(!ev)
+  return -ENOMEM;
+
+ spin_lock(&cgrp->fe_list_lock);
+ ev->eventfd = eventfd;
+ list_add(&ev->list, &cgrp->fe_notify);
+ spin_unlock(&cgrp->fe_list_lock);
+
+ return 0;
+}
+
+static void tasks_unregister_event(struct cgroup *cgrp,
+ struct cftype *cft, struct eventfd_ctx *eventfd)
+{
+ struct fe_eventfd_list *ev, *tmp;
+
+ spin_lock(&cgrp->fe_list_lock);
+ list_for_each_entry_safe(ev, tmp, &cgrp->fe_notify, list) {
+  if (ev->eventfd == eventfd) {
+   list_del(&ev->list);
+   kfree(ev);
+  }
+ }
+ spin_unlock(&cgrp->fe_list_lock);
+}
+
 /*
  * for the common functions, 'private' gives the type of file
  */
@@ -3670,6 +3704,8 @@ static struct cftype files[] = {
   .open = cgroup_tasks_open,
   .write_u64 = cgroup_tasks_write,
   .release = cgroup_pidlist_release,
+  .register_event = tasks_register_event,
+  .unregister_event = tasks_unregister_event,
   .mode = S_IRUGO | S_IWUSR,
  },

These additions are very similar to the "oom_control" handlers in memcontrol.c (see above). In the last part of the patch we send an event to the task's cgroup in every hierarchy whenever its state changes:
@@ -4558,6 +4594,22 @@ void cgroup_fork(struct task_struct *child)
  child->cgroups = current->cgroups;
  get_css_set(child->cgroups);
  INIT_LIST_HEAD(&child->cg_list);
+
+ struct cgroupfs_root *root;
+
+ /* send event to the userspace */
+ mutex_lock(&cgroup_mutex);
+ for_each_active_root(root) {
+  struct cgroup *cgrp;
+  struct fe_eventfd_list *ev;
+
+  cgrp = task_cgroup_from_root(child, root);
+
+  list_for_each_entry(ev, &cgrp->fe_notify, list) {
+   eventfd_signal(ev->eventfd, 1);
+  }
+ }
+ mutex_unlock(&cgroup_mutex);
 }
 
 /**
@@ -4653,6 +4705,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 {
  struct css_set *cg;
  int i;
+ struct cgroupfs_root *root;
 
  /*
   * Unlink from the css_set task list if necessary.
@@ -4666,6 +4719,20 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
   write_unlock(&css_set_lock);
  }
 
+ /* send event to the userspace */
+ mutex_lock(&cgroup_mutex);
+ for_each_active_root(root) {
+  struct cgroup *cgrp;
+  struct fe_eventfd_list *ev;
+
+  cgrp = task_cgroup_from_root(tsk, root);
+
+  list_for_each_entry(ev, &cgrp->fe_notify, list) {
+   eventfd_signal(ev->eventfd, 1);
+  }
+ }
+ mutex_unlock(&cgroup_mutex);
+

This is the interesting part. For some time I was thinking about how to send an event about a PID to every cgroup connected with that PID. The answer was found in the proc_cgroup_show() function: there we iterate over every hierarchy the PID is attached to and print its cgroup to procfs.
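Roughly, that pattern looks like this (a simplified sketch of the proc_cgroup_show() loop, with locking around the task and error handling trimmed):

/* Walk every mounted hierarchy and look up the task's cgroup in each one */
struct cgroupfs_root *root;
char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);

mutex_lock(&cgroup_mutex);
for_each_active_root(root) {
    struct cgroup *cgrp = task_cgroup_from_root(tsk, root);

    cgroup_path(cgrp, buf, PAGE_SIZE);          /* e.g. "/persistent" */
    seq_printf(m, "%d: %s\n", root->hierarchy_id, buf);
}
mutex_unlock(&cgroup_mutex);
kfree(buf);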

The complete patch is available here. It is against c38e23456278e967f094b08247ffc3711b1029b2 (i387: fix sense of sanity check). The client code with start and test scripts is also available in my github.
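To give a rough idea of what the client does, here is a minimal sketch of a restart loop; the helper pid_in_tasks() and the re-exec logic are hypothetical and made up for the example, the real start and test scripts live in the repository:

/* Hypothetical sketch: block on the eventfd registered above, then check
 * whether the watched PID is still listed in the group's tasks file and
 * re-exec the service if it is gone. */
#include <stdint.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

static int pid_in_tasks(const char *tasks_path, pid_t pid)
{
    FILE *f = fopen(tasks_path, "r");
    int cur, found = 0;

    if (!f)
        return 0;
    while (fscanf(f, "%d", &cur) == 1)
        if (cur == pid)
            found = 1;
    fclose(f);
    return found;
}

static void watch_loop(int efd, const char *tasks_path, pid_t pid, char *const argv[])
{
    uint64_t cnt;

    for (;;) {
        read(efd, &cnt, sizeof(cnt));           /* wait for a fork/exit event */
        if (!pid_in_tasks(tasks_path, pid)) {
            pid = fork();
            if (pid == 0) {
                execv(argv[0], argv);           /* restart the fallen service */
                _exit(1);
            }
            /* the new child still has to be put back into the cgroup,
             * e.g. by writing its PID into the tasks file */
        }
    }
}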



