[12/12] Work-in-progress of path remapping

Message ID 20220622223447.2462880-13-dmalcolm@redhat.com
State New
Headers
Series RFC: Replay of serialized diagnostics |

Commit Message

David Malcolm June 22, 2022, 10:34 p.m. UTC
  This work-in-progress hacks up the file_cache code in input.cc so that
it can have an optional path remapper, which can map e.g. paths in a
.sarif file to paths relative to that .sarif file, so that the
sarif-replayer can locate and display sources.

gcc/ChangeLog:
	* input.cc (file_cache::get_file_path): Replace with...
	(file_cache::get_original_file_path): ...this, and...
	(file_cache::get_remapped_file_path): ...this.
	(file_cache::create): Split "file_path" param into
	"original_file_path" and "remapped_file_path".
	(file_cache::m_file_path): Replace with...
	(file_cache::m_original_file_path): ...this, and...
	(file_cache::m_remapped_file_path): ...this.
	(total_lines_num): Rename file_path to original_file_path.
	(file_cache::lookup_file): Rename file_path to remapped_file_path;
	update cache lookup to use remapped file path.
	(file_cache::remap_file): New.
	(file_cache::set_path_remapper): New.
	(diagnostics_file_cache_set_path_remapper): New.
	(file_cache_slot::evict): Update for split into a pair of paths.
	(file_cache::evicted_cache_tab_entry): Likewise.
	(file_cache::add_file): Likewise.
	(file_cache_slot::create): Likewise.
	(file_cache::file_cache): Initialize m_remapper
	(file_cache::~file_cache): Delete m_remapper.
	(file_cache::lookup_or_add_file): Remap the file path.
	(file_cache_slot::file_cache_slot): Update for changes to fields.
	(file_cache_slot::~file_cache_slot): Free the m_remapped_file_path.
	* input.h (class path_remapper): New.
	(file_cache::set_path_remapper): New decl.
	(file_cache::lookup_file): Update decl.
	(file_cache::add_file): Update decl.
	(file_cache::remap_file): New decl.
	(file_cache::m_remapper): New field.
	(diagnostics_file_cache_set_path_remapper): New decl.
	* sarif/sarif-replay.cc (class sarif_path_remapper): New class.
	(sarif_replayer::emit_sarif_as_diagnostics): Use it.

gcc/testsuite/ChangeLog:
	* sarif/tutorial-example-foo.sarif: Fix the line numbers.
	Update the expected multiline output to show the source code.

Signed-off-by: David Malcolm <dmalcolm@redhat.com>
---
 gcc/input.cc                                  | 107 +++++++++++++-----
 gcc/input.h                                   |  18 ++-
 gcc/sarif/sarif-replay.cc                     |  39 +++++++
 .../sarif/tutorial-example-foo.sarif          |  25 +++-
 4 files changed, 155 insertions(+), 34 deletions(-)
  

Patch

diff --git a/gcc/input.cc b/gcc/input.cc
index 2acbfdea4f8..a78b949faa5 100644
--- a/gcc/input.cc
+++ b/gcc/input.cc
@@ -55,7 +55,8 @@  public:
 		      char ** line, ssize_t *line_len);
 
   /* Accessors.  */
-  const char *get_file_path () const { return m_file_path; }
+  const char *get_original_file_path () const { return m_original_file_path; }
+  const char *get_remapped_file_path () const { return m_remapped_file_path; }
   unsigned get_use_count () const { return m_use_count; }
   bool missing_trailing_newline_p () const
   {
@@ -65,7 +66,10 @@  public:
   void inc_use_count () { m_use_count++; }
 
   bool create (const file_cache::input_context &in_context,
-	       const char *file_path, FILE *fp, unsigned highest_use_count);
+	       const char *original_file_path,
+	       char *remapped_file_path,
+	       FILE *fp,
+	       unsigned highest_use_count);
   void evict ();
 
  private:
@@ -112,11 +116,14 @@  public:
      array.  */
   unsigned m_use_count;
 
-  /* The file_path is the key for identifying a particular file in
+  /* The m_original_file_path is the key for identifying a particular file in
      the cache.
      For libcpp-using code, the underlying buffer for this field is
      owned by the corresponding _cpp_file within the cpp_reader.  */
-  const char *m_file_path;
+  const char *m_original_file_path;
+
+  // FIXME:
+  char *m_remapped_file_path;
 
   FILE *m_fp;
 
@@ -310,11 +317,11 @@  diagnostic_file_cache_fini (void)
    equals the actual number of lines of the file.  */
 
 static size_t
-total_lines_num (const char *file_path)
+total_lines_num (const char *original_file_path)
 {
   size_t r = 0;
   location_t l = 0;
-  if (linemap_get_file_highest_location (line_table, file_path, &l))
+  if (linemap_get_file_highest_location (line_table, original_file_path, &l))
     {
       gcc_assert (l >= RESERVED_LOCATION_COUNT);
       expanded_location xloc = expand_location (l);
@@ -328,16 +335,17 @@  total_lines_num (const char *file_path)
    cached file was found.  */
 
 file_cache_slot *
-file_cache::lookup_file (const char *file_path)
+file_cache::lookup_file (const char *remapped_file_path)
 {
-  gcc_assert (file_path);
+  gcc_assert (remapped_file_path);
 
   /* This will contain the found cached file.  */
   file_cache_slot *r = NULL;
   for (unsigned i = 0; i < num_file_slots; ++i)
     {
       file_cache_slot *c = &m_file_slots[i];
-      if (c->get_file_path () && !strcmp (c->get_file_path (), file_path))
+      if (c->get_remapped_file_path ()
+	  && !strcmp (c->get_remapped_file_path (), remapped_file_path))
 	{
 	  c->inc_use_count ();
 	  r = c;
@@ -350,6 +358,27 @@  file_cache::lookup_file (const char *file_path)
   return r;
 }
 
+// FIXME
+
+char *
+file_cache::remap_file (const char *file_path) const
+{
+  if (m_remapper)
+    return m_remapper->remap_file (file_path);
+  else
+    return xstrdup (file_path);
+  // FIXME: this is probably being called too much
+}
+
+// FIXME:
+
+void
+file_cache::set_path_remapper (path_remapper *remapper)
+{
+  delete m_remapper;
+  m_remapper = remapper;
+}
+
 /* Purge any mention of FILENAME from the cache of files used for
    printing source code.  For use in selftests when working
    with tempfiles.  */
@@ -365,6 +394,15 @@  diagnostics_file_cache_forcibly_evict_file (const char *file_path)
   global_dc->m_file_cache->forcibly_evict_file (file_path);
 }
 
+// FIXME:
+
+void
+diagnostics_file_cache_set_path_remapper (path_remapper *remapper)
+{
+  diagnostic_file_cache_init ();
+  global_dc->m_file_cache->set_path_remapper (remapper);
+}
+
 void
 file_cache::forcibly_evict_file (const char *file_path)
 {
@@ -381,7 +419,9 @@  file_cache::forcibly_evict_file (const char *file_path)
 void
 file_cache_slot::evict ()
 {
-  m_file_path = NULL;
+  m_original_file_path = NULL;
+  free (m_remapped_file_path);
+  m_remapped_file_path = NULL;
   if (m_fp)
     fclose (m_fp);
   m_fp = NULL;
@@ -409,10 +449,10 @@  file_cache::evicted_cache_tab_entry (unsigned *highest_use_count)
   for (unsigned i = 1; i < num_file_slots; ++i)
     {
       file_cache_slot *c = &m_file_slots[i];
-      bool c_is_empty = (c->get_file_path () == NULL);
+      bool c_is_empty = (c->get_original_file_path () == NULL);
 
       if (c->get_use_count () < to_evict->get_use_count ()
-	  || (to_evict->get_file_path () && c_is_empty))
+	  || (to_evict->get_original_file_path () && c_is_empty))
 	/* We evict C because it's either an entry with a lower use
 	   count or one that is empty.  */
 	to_evict = c;
@@ -432,6 +472,7 @@  file_cache::evicted_cache_tab_entry (unsigned *highest_use_count)
   return to_evict;
 }
 
+// FIXME:
 /* Create the cache used for the content of a given file to be
    accessed by caret diagnostic.  This cache is added to an array of
    cache and can be retrieved by lookup_file_in_cache_tab.  This
@@ -439,29 +480,35 @@  file_cache::evicted_cache_tab_entry (unsigned *highest_use_count)
    num_file_slots files are cached.  */
 
 file_cache_slot*
-file_cache::add_file (const char *file_path)
+file_cache::add_file (const char *original_file_path,
+		      char *remapped_file_path)
 {
-
-  FILE *fp = fopen (file_path, "r");
+  FILE *fp = fopen (remapped_file_path, "r");
   if (fp == NULL)
     return NULL;
 
   unsigned highest_use_count = 0;
   file_cache_slot *r = evicted_cache_tab_entry (&highest_use_count);
-  if (!r->create (in_context, file_path, fp, highest_use_count))
+  if (!r->create (in_context, original_file_path, remapped_file_path,
+		  fp, highest_use_count))
     return NULL;
   return r;
 }
 
+// FIXME:
 /* Populate this slot for use on FILE_PATH and FP, dropping any
    existing cached content within it.  */
+// FIXME: take ownership of REMAPPED_FILE_PATH.
 
 bool
 file_cache_slot::create (const file_cache::input_context &in_context,
-			 const char *file_path, FILE *fp,
+			 const char *original_file_path,
+			 char *remapped_file_path,
+			 FILE *fp,
 			 unsigned highest_use_count)
 {
-  m_file_path = file_path;
+  m_original_file_path = original_file_path;
+  m_remapped_file_path = remapped_file_path;
   if (m_fp)
     fclose (m_fp);
   m_fp = fp;
@@ -474,19 +521,20 @@  file_cache_slot::create (const file_cache::input_context &in_context,
   /* Ensure that this cache entry doesn't get evicted next time
      add_file_to_cache_tab is called.  */
   m_use_count = ++highest_use_count;
-  m_total_lines = total_lines_num (file_path);
+  m_total_lines = total_lines_num (original_file_path);
   m_missing_trailing_newline = true;
 
+  // FIXME: which file_path should we be using below?
 
   /* Check the input configuration to determine if we need to do any
      transformations, such as charset conversion or BOM skipping.  */
-  if (const char *input_charset = in_context.ccb (file_path))
+  if (const char *input_charset = in_context.ccb (original_file_path))
     {
       /* Need a full-blown conversion of the input charset.  */
       fclose (m_fp);
       m_fp = NULL;
       const cpp_converted_source cs
-	= cpp_get_converted_source (file_path, input_charset);
+	= cpp_get_converted_source (original_file_path, input_charset);
       if (!cs.data)
 	return false;
       if (m_data)
@@ -511,7 +559,8 @@  file_cache_slot::create (const file_cache::input_context &in_context,
 /* file_cache's ctor.  */
 
 file_cache::file_cache ()
-: m_file_slots (new file_cache_slot[num_file_slots])
+: m_file_slots (new file_cache_slot[num_file_slots]),
+  m_remapper (NULL)
 {
   initialize_input_context (nullptr, false);
 }
@@ -521,6 +570,7 @@  file_cache::file_cache ()
 file_cache::~file_cache ()
 {
   delete[] m_file_slots;
+  delete m_remapper;
 }
 
 /* Lookup the cache used for the content of a given file accessed by
@@ -529,11 +579,14 @@  file_cache::~file_cache ()
    it.  */
 
 file_cache_slot*
-file_cache::lookup_or_add_file (const char *file_path)
+file_cache::lookup_or_add_file (const char *original_file_path)
 {
-  file_cache_slot *r = lookup_file (file_path);
+  char *remapped_file_path = remap_file (original_file_path);
+  file_cache_slot *r = lookup_file (remapped_file_path);
   if (r == NULL)
-    r = add_file (file_path);
+    r = add_file (original_file_path, remapped_file_path);
+  else
+    free (remapped_file_path);
   return r;
 }
 
@@ -541,7 +594,8 @@  file_cache::lookup_or_add_file (const char *file_path)
    diagnostic.  */
 
 file_cache_slot::file_cache_slot ()
-: m_use_count (0), m_file_path (NULL), m_fp (NULL), m_data (0),
+: m_use_count (0), m_original_file_path (NULL), m_remapped_file_path (NULL),
+  m_fp (NULL), m_data (0),
   m_alloc_offset (0), m_size (0), m_nb_read (0), m_line_start_idx (0),
   m_line_num (0), m_total_lines (0), m_missing_trailing_newline (true)
 {
@@ -552,6 +606,7 @@  file_cache_slot::file_cache_slot ()
 
 file_cache_slot::~file_cache_slot ()
 {
+  free (m_remapped_file_path);
   if (m_fp)
     {
       fclose (m_fp);
diff --git a/gcc/input.h b/gcc/input.h
index f1ae3aec95c..8539317c513 100644
--- a/gcc/input.h
+++ b/gcc/input.h
@@ -118,6 +118,13 @@  extern bool location_missing_trailing_newline (const char *file_path);
    need to be in this header.  */
 class file_cache_slot;
 
+class path_remapper
+{
+public:
+  virtual ~path_remapper () {}
+  virtual char *remap_file (const char *file_path) const = 0;
+};
+
 /* A cache of source files for use when emitting diagnostics
    (and in a few places in the C/C++ frontends).
 
@@ -145,15 +152,20 @@  class file_cache
   void initialize_input_context (diagnostic_input_charset_callback ccb,
 				 bool should_skip_bom);
 
+  void set_path_remapper (path_remapper *remapper);
+
  private:
   file_cache_slot *evicted_cache_tab_entry (unsigned *highest_use_count);
-  file_cache_slot *add_file (const char *file_path);
-  file_cache_slot *lookup_file (const char *file_path);
+  file_cache_slot *add_file (const char *original_file_path,
+			     char *remapped_file_path);
+  file_cache_slot *lookup_file (const char *remapped_file_path);
+  char *remap_file (const char *file_path) const;
 
  private:
   static const size_t num_file_slots = 16;
   file_cache_slot *m_file_slots;
   input_context in_context;
+  path_remapper *m_remapper;
 };
 
 extern expanded_location
@@ -246,6 +258,8 @@  void diagnostics_file_cache_fini (void);
 
 void diagnostics_file_cache_forcibly_evict_file (const char *file_path);
 
+void diagnostics_file_cache_set_path_remapper (path_remapper *remapper);
+
 class GTY(()) string_concat
 {
 public:
diff --git a/gcc/sarif/sarif-replay.cc b/gcc/sarif/sarif-replay.cc
index 2d5c58ead1e..b8f9c152879 100644
--- a/gcc/sarif/sarif-replay.cc
+++ b/gcc/sarif/sarif-replay.cc
@@ -597,6 +597,37 @@  sarif_replayer::~sarif_replayer ()
     delete iter.second;
 }
 
+// FIXME:
+
+class sarif_path_remapper : public path_remapper
+{
+public:
+  sarif_path_remapper (const char *sarif_filename)
+  {
+    /* Get the directory containing SARIF_FILENAME.  */
+    size_t overall_len = strlen (sarif_filename);
+    const char *last_comp = basename (sarif_filename);
+    gcc_assert (last_comp);
+    size_t dir_len =  last_comp - sarif_filename;
+    m_dir = (char *)xmalloc (dir_len + 1);
+    memcpy (m_dir, sarif_filename, dir_len);
+    m_dir[dir_len] = '\0';
+  }
+  ~sarif_path_remapper ()
+  {
+    free (m_dir);
+  }
+
+  char *remap_file (const char *file_path) const final override
+  {
+    // TODO: what about absolute FILE_PATH ?
+    return concat (m_dir, file_path, NULL);
+  }
+
+private:
+  char *m_dir;
+};
+
 /* Perform one pass of replay of the output file.
    Pass 0 captures the source locations of interest, so that we can generate
    line_maps.
@@ -607,6 +638,14 @@  sarif_replayer::emit_sarif_as_diagnostics (json::value *jv, int pass)
 {
   m_pass = pass;
 
+  /* Remap paths on 2nd pass: that way, errors in the SARIF file get
+     reported directly, whereas replayed diagnostics get remapped.  */
+  if (pass == 1)
+    {
+      diagnostics_file_cache_set_path_remapper
+	(new sarif_path_remapper (m_filename));
+    }
+
   /* We expect a sarifLog object as the top-level value
      (SARIF v2.1.0 section 3.13).  */
   json::object *toplev_obj = dyn_cast <json::object *> (jv);
diff --git a/gcc/testsuite/sarif/tutorial-example-foo.sarif b/gcc/testsuite/sarif/tutorial-example-foo.sarif
index 8fa37ad4b42..d73bbd3b93e 100644
--- a/gcc/testsuite/sarif/tutorial-example-foo.sarif
+++ b/gcc/testsuite/sarif/tutorial-example-foo.sarif
@@ -25,7 +25,7 @@ 
                   "uri": "bad-eval-with-code-flow.py"
                 },
                 "region": {
-                  "startLine": 8
+                  "startLine": 10
                 }
               }
             }
@@ -45,7 +45,7 @@ 
                             "uri": "bad-eval-with-code-flow.py"
                           },
                           "region": {
-                            "startLine": 3
+                            "startLine": 5
                           }
                         }
                       },
@@ -63,7 +63,7 @@ 
                             "uri": "bad-eval-with-code-flow.py"
                           },
                           "region": {
-                            "startLine": 4
+                            "startLine": 6
                           }
                         }
                       },
@@ -81,7 +81,7 @@ 
                             "uri": "bad-eval-with-code-flow.py"
                           },
                           "region": {
-                            "startLine": 38
+                            "startLine": 10
                           }
                         }
                       },
@@ -104,14 +104,27 @@ 
 }
 
 /* { dg-begin-multiline-output "" }
-bad-eval-with-code-flow.py:8:1: warning: Use of tainted variable 'raw_input' in the insecure function 'eval'. [PY2335]
+bad-eval-with-code-flow.py:10:1: warning: Use of tainted variable 'raw_input' in the insecure function 'eval'. [PY2335]
+   10 |     print(eval(raw_input))
+      | ^
   events 1-2
     |
+    |    5 | print("Hello, world!")
+    |      | ^
+    |      | |
+    |      | (1) 
+    |    6 | expr = input("Expression> ")
+    |      | ~
+    |      | |
+    |      | (2) 
     |
     +--> event 3
            |
+           |   10 |     print(eval(raw_input))
+           |      | ^
+           |      | |
+           |      | (3) 
            |
    { dg-end-multiline-output "" } */
 
 // TODO: logical locations?
-// TODO: fix showing the source code