diff mbox

Crash on thread id wrap around

Message ID 1427373950-30952-1-git-send-email-palves@redhat.com
State New
Headers show

Commit Message

Pedro Alves March 26, 2015, 12:45 p.m. UTC
On GNU/Linux, if the target reuses the TID of a thread that GDB still
has in its list marked as THREAD_EXITED, GDB crashes, like:

 (gdb) continue
 Continuing.
 /home/pedro/gdb/mygit/src/gdb/thread.c:789: internal-error: set_running: Assertion `tp->state != THREAD_EXITED' failed.
 A problem internal to GDB has been detected,
 further debugging may prove unreliable.
 Quit this debugging session? (y or n) FAIL: gdb.threads/tid-reuse.exp: continue to breakpoint: after_wrap (GDB internal error)
 Resyncing due to internal error.
 n

Here:

 (top-gdb) bt
 #0  internal_error (file=0x953dd8 "src/gdb/thread.c", line=789, fmt=0x953da0 "%s: Assertion `%s' failed.")
     at src/gdb/common/errors.c:54
 #1  0x0000000000638514 in set_running (ptid=..., running=1) at src/gdb/thread.c:789
 #2  0x00000000004bda42 in linux_handle_extended_wait (lp=0x16f5760, status=0, stopping=0) at src/gdb/linux-nat.c:2114
 #3  0x00000000004bfa24 in linux_nat_filter_event (lwpid=20570, status=198015) at src/gdb/linux-nat.c:3127
 #4  0x00000000004c070e in linux_nat_wait_1 (ops=0xe193d0, ptid=..., ourstatus=0x7fffffffd2c0, target_options=1) at src/gdb/linux-nat.c:3478
 #5  0x00000000004c1015 in linux_nat_wait (ops=0xe193d0, ptid=..., ourstatus=0x7fffffffd2c0, target_options=1) at src/gdb/linux-nat.c:3722
 #6  0x00000000004c92d2 in thread_db_wait (ops=0xd80b60 <thread_db_ops>, ptid=..., ourstatus=0x7fffffffd2c0, options=1)
     at src/gdb/linux-thread-db.c:1525
 #7  0x000000000066db43 in delegate_wait (self=0xd80b60 <thread_db_ops>, arg1=..., arg2=0x7fffffffd2c0, arg3=1) at src/gdb/target-delegates.c:116
 #8  0x000000000067e54b in target_wait (ptid=..., status=0x7fffffffd2c0, options=1) at src/gdb/target.c:2206
 #9  0x0000000000625111 in fetch_inferior_event (client_data=0x0) at src/gdb/infrun.c:3275
 #10 0x0000000000648a3b in inferior_event_handler (event_type=INF_REG_EVENT, client_data=0x0) at src/gdb/inf-loop.c:56
 #11 0x00000000004c2ecb in handle_target_event (error=0, client_data=0x0) at src/gdb/linux-nat.c:4655

I managed to come up with a test that reliably reproduces this.  It
relies on pids wrapping around though, so could potentially take a
while.  On my box that's 4 seconds; on gcc110, a PPC box which has
max_pid set to 65536, it's over 10 seconds.  So I made the test
compute how long that would take, and cap the time waited if that
would be too long.

Tested on x86_64 Fedora 20.

gdb/ChangeLog:
2015-03-26  Pedro Alves  <palves@redhat.com>

	* linux-thread-db.c (record_thread): Re-add the thread to gdb's
	list if it was marked exited.

gdb/testsuite/ChangeLog:
2015-03-26  Pedro Alves  <palves@redhat.com>

	* gdb.threads/tid-reuse.c: New file.
	* gdb.threads/tid-reuse.exp: New file.
---
 gdb/linux-thread-db.c                   |   6 +-
 gdb/testsuite/gdb.threads/tid-reuse.c   | 149 ++++++++++++++++++++++++++++++++
 gdb/testsuite/gdb.threads/tid-reuse.exp |  84 ++++++++++++++++++
 3 files changed, 237 insertions(+), 2 deletions(-)
 create mode 100644 gdb/testsuite/gdb.threads/tid-reuse.c
 create mode 100644 gdb/testsuite/gdb.threads/tid-reuse.exp

Comments

Mark Kettenis March 26, 2015, 1:05 p.m. UTC | #1
> From: Pedro Alves <palves@redhat.com>
> Date: Thu, 26 Mar 2015 12:45:50 +0000
> 
> I managed to come up with a test that reliably reproduces this.  It
> relies on pids wrapping around though, so could potentially take a
> while.  On my box that's 4 seconds; on gcc110, a PPC box which has
> max_pid set to 65536, it's over 10 seconds.  So I made the test
> compute how long that would take, and cap the time waited if that
> would be too long.

You can't really count on thread IDs wrapping on systems that
randomize them.  There is still a chance that you'll reuse one, of
course.  Not much you can do about this, but folks should be aware
that this may cause non-reproducible test results on some systems.
diff mbox

Patch

diff --git a/gdb/linux-thread-db.c b/gdb/linux-thread-db.c
index 88094a7..886d8ac 100644
--- a/gdb/linux-thread-db.c
+++ b/gdb/linux-thread-db.c
@@ -1346,8 +1346,10 @@  record_thread (struct thread_db_info *info,
   priv->tid = ti_p->ti_tid;
   update_thread_state (priv, ti_p);
 
-  /* Add the thread to GDB's thread list.  */
-  if (tp == NULL)
+  /* Add the thread to GDB's thread list.  If we already know about a
+     thread with this PTID, but it's marked exited, then the kernel
+     reused the tid of an old thread.  */
+  if (tp == NULL || tp->state == THREAD_EXITED)
     tp = add_thread_with_info (ptid, priv);
   else
     tp->priv = priv;
diff --git a/gdb/testsuite/gdb.threads/tid-reuse.c b/gdb/testsuite/gdb.threads/tid-reuse.c
new file mode 100644
index 0000000..9127fe7
--- /dev/null
+++ b/gdb/testsuite/gdb.threads/tid-reuse.c
@@ -0,0 +1,149 @@ 
+/* This testcase is part of GDB, the GNU debugger.
+
+   Copyright 2015 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#define _GNU_SOURCE
+#include <assert.h>
+#include <pthread.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <limits.h>
+
+/* Number of threads spawned.  */
+unsigned long thread_counter;
+
+/* How long it takes for the tid number space to wrap around, in
+   seconds.  It'll be capped to a lower value if we can't compute
+   it.  */
+unsigned int wrap_time = -1;
+
+/* How many threads fit in the target's thread number space before tid
+   wrapping occurs.  */
+int tid_wrap = -1;
+
+void *
+do_nothing_thread_func (void *arg)
+{
+  usleep (1);  /* Pause very briefly, then let the thread exit.  */
+  return NULL;
+}
+
+void *
+spawner_thread_func (void *arg)
+{
+  /* Spawn and reap one short-lived thread at a time, forever, counting
+     each spawn attempt in THREAD_COUNTER.  */
+  for (;;)
+    {
+      pthread_t worker;
+      int err;
+
+      thread_counter++;
+      err = pthread_create (&worker, NULL, do_nothing_thread_func, NULL);
+      assert (err == 0);
+      err = pthread_join (worker, NULL);
+      assert (err == 0);
+    }
+
+  return NULL;
+}
+
+/* Called once the program is done counting spawned threads and has
+   computed WRAP_TIME.  The testcase sets a breakpoint here.  */
+
+void
+after_count (void)
+{
+}
+
+/* Called after the WRAP_TIME sleep, by when tids should have wrapped.  */
+
+void
+after_wrap (void)
+{
+}
+
+#ifdef __linux__
+
+/* Read the running system's configured pid_max from procfs.  Returns
+   -1 if the file cannot be opened or read.  */
+
+static int
+linux_proc_get_pid_max (void)
+{
+  static const char pid_max_path[] = "/proc/sys/kernel/pid_max";
+  char line[100];
+  int pid_max = -1;
+  FILE *fp = fopen (pid_max_path, "r");
+
+  if (fp != NULL)
+    {
+      /* Parse the first line of the file as a decimal number.  */
+      if (fgets (line, sizeof (line), fp) != NULL)
+	pid_max = strtol (line, NULL, 10);
+      fclose (fp);
+    }
+  else
+    fprintf (stderr, "unable to open %s\n", pid_max_path);
+
+  return pid_max;
+}
+
+#endif
+
+int
+main (int argc, char *argv[])
+{
+  pthread_t child;
+  int rc;
+  int wrap_time_raw = 0;
+
+  rc = pthread_create (&child, NULL, spawner_thread_func, NULL);
+  assert (rc == 0);
+
+#define COUNT_TIME 2  /* Seconds spent counting spawned threads.  */
+  sleep (COUNT_TIME);
+
+#ifdef __linux__
+  tid_wrap = linux_proc_get_pid_max ();
+#endif
+  /* If we don't know how many threads it would take to wrap around on
+     this system (or counted no threads), just run the test for a bit.  */
+  if (tid_wrap > 0 && thread_counter > 0)
+    {
+      wrap_time_raw = tid_wrap / ((float) thread_counter / COUNT_TIME) + 0.5;
+
+      /* Give it a bit more, just in case.  */
+      wrap_time = wrap_time_raw + 3;
+    }
+
+  /* 4 seconds were sufficient on the machine where this was first
+     observed, an Intel i7-2620M @ 2.70GHz running Linux 3.18.7, with
+     pid_max=32768.  Going forward, as machines get faster, this will
+     need less time, unless pid_max is set to a very high number.  To
+     avoid unreasonably long test time, cap to an upper bound.  */
+  if (wrap_time > 60)
+    wrap_time = 60;
+  printf ("thread_counter=%lu, tid_wrap = %d, wrap_time_raw=%d, wrap_time=%u\n",
+	  thread_counter, tid_wrap, wrap_time_raw, wrap_time);
+  after_count ();
+
+  sleep (wrap_time);
+
+  after_wrap ();
+  return 0;
+}
diff --git a/gdb/testsuite/gdb.threads/tid-reuse.exp b/gdb/testsuite/gdb.threads/tid-reuse.exp
new file mode 100644
index 0000000..cf5398c
--- /dev/null
+++ b/gdb/testsuite/gdb.threads/tid-reuse.exp
@@ -0,0 +1,84 @@ 
+# Copyright 2015 Free Software Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# Test running a program that spawns enough threads that the tid
+# number space wraps around, all while having an exited selected
+# thread.  At some point, the exited thread's tid is reused.  GDB
+# should not crash when this happens.
+
+standard_testfile
+
+set options { debug pthreads }
+
+if {[prepare_for_testing "failed to prepare" $testfile $srcfile $options] == -1} {
+    return -1
+}
+
+clean_restart ${binfile}
+
+if ![runto main] {
+    fail "Can't run to main"
+    return -1
+}
+
+delete_breakpoints
+
+# Keep the logs free of the flood of thread create/exit notifications.
+gdb_test_no_output "set print thread-events off"
+
+gdb_breakpoint after_count
+gdb_continue_to_breakpoint after_count
+
+# Return the value of VARIABLE in the inferior, as printed by GDB.
+# Returns 0 if GDB's reply does not match a decimal value.
+proc getvar {variable} {
+    global decimal gdb_prompt
+
+    set test "get $variable"
+    set result 0
+
+    gdb_test_multiple "print $variable" $test {
+	-re " = ($decimal)\r\n$gdb_prompt $" {
+	    set result $expect_out(1,string)
+	    pass $test
+	}
+    }
+
+    return $result
+}
+
+set inf_timeout [getvar "wrap_time"]
+
+# Now the real test.  Run to a breakpoint in a thread that exits
+# immediately once resumed.  The thread ends up left on the thread
+# list, marked exited (exactly because it's the selected thread).
+gdb_breakpoint "do_nothing_thread_func"
+gdb_continue_to_breakpoint "do_nothing_thread_func"
+
+delete_breakpoints
+
+# Let the program continue, constantly spawning short-lived threads
+# (one at a time).  On some targets (e.g., GNU/Linux), after a bit, a
+# new thread reuses the tid of the old exited thread that we still
+# have selected.  GDB should not crash in this situation.  Of course,
+# if the tid number space is shared between all processes in the
+# system (such as on Linux), there's a chance that some other process
+# grabs the TID, but that can never cause a spurious test fail.
+gdb_breakpoint "after_wrap"
+
+# Higher than what the test program sleeps before exiting.
+set timeout [expr {$inf_timeout * 2}]
+
+gdb_continue_to_breakpoint "after_wrap"