PR gdb/15713 - errors from i386_linux_resume lead to lock-up

Message ID 1400676468-2216-1-git-send-email-palves@redhat.com
State Committed
Headers

Commit Message

Pedro Alves May 21, 2014, 12:47 p.m. UTC
  linux_nat_resume is not considering that linux_ops->to_resume may throw:

  /* Mark LWP as not stopped to prevent it from being continued by
     linux_nat_resume_callback.  */
  lp->stopped = 0;

  if (resume_many)
    iterate_over_lwps (ptid, linux_nat_resume_callback, NULL);

If something within linux_nat_resume_callback throws, GDB leaves the
lwp_info as if the inferior was resumed, while it actually wasn't.

A couple of examples (there are possibly others):

 - i386_linux_resume calls target_read which calls QUIT.
 - if the actual ptrace resumption fails in inf_ptrace_resume,
   perror_with_name is called.

If the user tries to kill the inferior at this point (or quit, which
offers to kill), GDB locks up trying to stop the lwp -- if it is
already stopped no new waitpid event gets generated for it.

Fix this by setting the stopped flag earlier, as soon as we collect a
stop event with waitpid, and by always clearing it only after the lwp
has been resumed successfully.

Tested on x86_64 Fedora 20.  Confirmed the lock-up disappears using a
local hack that forces an error in inf_ptrace_resume.

Also fixes a little "set debug lin-lwp" annoyance.  Currently we always see:

 Continuing.
 LLR: Preparing to resume process 6802, 0, inferior_ptid Thread 0x7ffff7fc7740 (LWP 6802)
                                                                                ^^^^^^^^
 RC: Resuming sibling Thread 0x7ffff77c5700 (LWP 6807), 0, resume
 RC: Resuming sibling Thread 0x7ffff7fc6700 (LWP 6806), 0, resume
 RC: Not resuming sibling Thread 0x7ffff7fc7740 (LWP 6802) (not stopped)
                                                 ^^^^^^^^^^^^^^^^^^^^^^^
 LLR: PTRACE_CONT process 6802, 0 (resume event thread)

This patch gets rid of the "Not resuming sibling" line.

2014-05-21  Pedro Alves  <palves@redhat.com>

	PR gdb/15713
	* linux-nat.c (linux_nat_resume_callback): Rename the second
	parameter to 'except'.  Skip LP if it points to EXCEPT.
	(linux_nat_resume): Don't mark the event lwp as not stopped
	before resuming sibling lwps.  Instead ask
	linux_nat_resume_callback to skip the event lwp.  Mark it as not
	stopped after actually resuming it.
	(linux_handle_syscall_trap): Mark the lwp as not stopped after
	resuming it.
	(wait_lwp): Mark the lwp as stopped here.
	(stop_wait_callback): Mark the lwp as not stopped right after
	resuming it.  Don't mark lwps as stopped here.
	(linux_nat_filter_event): Mark the lwp as stopped earlier.
	(linux_nat_wait_1): Don't mark dead lwps as stopped here.
---
 gdb/linux-nat.c | 41 ++++++++++++++++++-----------------------
 1 file changed, 18 insertions(+), 23 deletions(-)
  

Comments

Pedro Alves May 29, 2014, 11:51 a.m. UTC | #1
Sergio mentioned off list that this worked for him too, so I've
pushed it in.

Thanks,
  

Patch

diff --git a/gdb/linux-nat.c b/gdb/linux-nat.c
index e84ee95..0d92b25 100644
--- a/gdb/linux-nat.c
+++ b/gdb/linux-nat.c
@@ -1642,13 +1642,17 @@  resume_lwp (struct lwp_info *lp, int step, enum gdb_signal signo)
     }
 }
 
-/* Resume LWP, with the last stop signal, if it is in pass state.  */
+/* Callback for iterate_over_lwps.  If LWP is EXCEPT, do nothing.
+   Resume LWP with the last stop signal, if it is in pass state.  */
 
 static int
-linux_nat_resume_callback (struct lwp_info *lp, void *data)
+linux_nat_resume_callback (struct lwp_info *lp, void *except)
 {
   enum gdb_signal signo = GDB_SIGNAL_0;
 
+  if (lp == except)
+    return 0;
+
   if (lp->stopped)
     {
       struct thread_info *thread;
@@ -1764,12 +1768,8 @@  linux_nat_resume (struct target_ops *ops,
       return;
     }
 
-  /* Mark LWP as not stopped to prevent it from being continued by
-     linux_nat_resume_callback.  */
-  lp->stopped = 0;
-
   if (resume_many)
-    iterate_over_lwps (ptid, linux_nat_resume_callback, NULL);
+    iterate_over_lwps (ptid, linux_nat_resume_callback, lp);
 
   /* Convert to something the lower layer understands.  */
   ptid = pid_to_ptid (ptid_get_lwp (lp->ptid));
@@ -1778,6 +1778,7 @@  linux_nat_resume (struct target_ops *ops,
     linux_nat_prepare_to_resume (lp);
   linux_ops->to_resume (linux_ops, ptid, step, signo);
   lp->stopped_by_watchpoint = 0;
+  lp->stopped = 0;
 
   if (debug_linux_nat)
     fprintf_unfiltered (gdb_stdlog,
@@ -1864,6 +1865,7 @@  linux_handle_syscall_trap (struct lwp_info *lp, int stopping)
 
       lp->syscall_state = TARGET_WAITKIND_IGNORE;
       ptrace (PTRACE_CONT, ptid_get_lwp (lp->ptid), 0, 0);
+      lp->stopped = 0;
       return 1;
     }
 
@@ -1947,6 +1949,7 @@  linux_handle_syscall_trap (struct lwp_info *lp, int stopping)
     linux_nat_prepare_to_resume (lp);
   linux_ops->to_resume (linux_ops, pid_to_ptid (ptid_get_lwp (lp->ptid)),
 			lp->step, GDB_SIGNAL_0);
+  lp->stopped = 0;
   return 1;
 }
 
@@ -2156,7 +2159,7 @@  linux_handle_extended_wait (struct lwp_info *lp, int status,
 	  linux_ops->to_resume (linux_ops,
 				pid_to_ptid (ptid_get_lwp (lp->ptid)),
 				0, GDB_SIGNAL_0);
-
+	  lp->stopped = 0;
 	  return 1;
 	}
 
@@ -2311,6 +2314,7 @@  wait_lwp (struct lwp_info *lp)
     }
 
   gdb_assert (WIFSTOPPED (status));
+  lp->stopped = 1;
 
   /* Handle GNU/Linux's syscall SIGTRAPs.  */
   if (WIFSTOPPED (status) && WSTOPSIG (status) == SYSCALL_SIGTRAP)
@@ -2564,6 +2568,7 @@  stop_wait_callback (struct lwp_info *lp, void *data)
 
 	  errno = 0;
 	  ptrace (PTRACE_CONT, ptid_get_lwp (lp->ptid), 0, 0);
+	  lp->stopped = 0;
 	  if (debug_linux_nat)
 	    fprintf_unfiltered (gdb_stdlog,
 				"PTRACE_CONT %s, 0, 0 (%s) "
@@ -2590,9 +2595,7 @@  stop_wait_callback (struct lwp_info *lp, void *data)
 
 	  /* Save the sigtrap event.  */
 	  lp->status = status;
-	  gdb_assert (!lp->stopped);
 	  gdb_assert (lp->signalled);
-	  lp->stopped = 1;
 	}
       else
 	{
@@ -2604,8 +2607,6 @@  stop_wait_callback (struct lwp_info *lp, void *data)
 				"SWC: Delayed SIGSTOP caught for %s.\n",
 				target_pid_to_str (lp->ptid));
 
-	  lp->stopped = 1;
-
 	  /* Reset SIGNALLED only after the stop_wait_callback call
 	     above as it does gdb_assert on SIGNALLED.  */
 	  lp->signalled = 0;
@@ -2933,6 +2934,10 @@  linux_nat_filter_event (int lwpid, int status, int *new_pending_p)
   if (!WIFSTOPPED (status) && !lp)
     return NULL;
 
+  /* This LWP is stopped now.  (And if dead, this prevents it from
+     ever being continued.)  */
+  lp->stopped = 1;
+
   /* Handle GNU/Linux's syscall SIGTRAPs.  */
   if (WIFSTOPPED (status) && WSTOPSIG (status) == SYSCALL_SIGTRAP)
     {
@@ -2975,7 +2980,6 @@  linux_nat_filter_event (int lwpid, int status, int *new_pending_p)
 	 used.  */
       if (ptid_get_pid (lp->ptid) == ptid_get_lwp (lp->ptid))
 	{
-	  lp->stopped = 1;
 	  iterate_over_lwps (pid_to_ptid (ptid_get_pid (lp->ptid)),
 			     stop_and_resume_callback, new_pending_p);
 	}
@@ -3320,13 +3324,9 @@  retry:
 				     " cancelled it\n",
 				     ptid_get_lwp (lp->ptid));
 			}
-		      lp->stopped = 1;
 		    }
 		  else
-		    {
-		      lp->stopped = 1;
-		      lp->signalled = 0;
-		    }
+		    lp->signalled = 0;
 		}
 	      else if (WIFEXITED (lp->status) || WIFSIGNALED (lp->status))
 		{
@@ -3343,11 +3343,6 @@  retry:
 		     pending for the next time we're able to report
 		     it.  */
 
-		  /* Prevent trying to stop this thread again.  We'll
-		     never try to resume it because it has a pending
-		     status.  */
-		  lp->stopped = 1;
-
 		  /* Dead LWP's aren't expected to reported a pending
 		     sigstop.  */
 		  lp->signalled = 0;