6 Replies Latest reply: Jun 28, 2011 4:22 AM by "Andrei Costache, Oracle-Oracle" RSS

    segmentation violation in BDB code in macro SH_TAILQ_REMOVE

    857786
      Hello OTN Forum readers

      I wrote an application with BDB-5.1.19 that uses the replication manager and is distributed across 2 sites.
      Sometimes, after a network partition (no link) between the two nodes, and with the following configuration:

      nsites=2
      priority=100
      ack=quorum


      I get a segmentation violation in the BDB code

      [New process 3489]
      #0 0x00002af382d07885 in __env_alloc_free (infop=0x1418fda0,
      ptr=0x2aaaad55b920) at ../src/env/env_alloc.c:421
      421 SH_TAILQ_REMOVE(&head->addrq, elp, addrq, __alloc_element);
      (gdb) bt
      #0 0x00002af382d07885 in __env_alloc_free (infop=0x1418fda0,
      ptr=0x2aaaad55b920) at ../src/env/env_alloc.c:421
      #1 0x00002af382c58b04 in __rep_grow_sites (env=0x1418f620, nsites=2)
      at ../src/rep/rep_elect.c:1336
      #2 0x00002af382c5854e in __rep_elect_init (env=0x1418f620, nsites=2,
      nvotes=1, beginp=0x606d1ed4, otally=0x606d1ed0)
      at ../src/rep/rep_elect.c:1152
      #3 0x00002af382c56112 in __rep_elect_int (env=0x1418f620, given_nsites=2,
      nvotes=1, flags=0) at ../src/rep/rep_elect.c:251
      #4 0x00002af382c77581 in __repmgr_elect (env=0x1418f620, nsites=2, nvotes=1,
      failtimep=0x606d2050) at ../src/repmgr/repmgr_elect.c:455
      #5 0x00002af382c76d33 in __repmgr_elect_main (env=0x1418f620,
      th=0x2aaab0008e20) at ../src/repmgr/repmgr_elect.c:168
      #6 0x00002af382c76b9e in __repmgr_elect_thread (argsp=0x2aaab0008e20)
      at ../src/repmgr/repmgr_elect.c:102
      #7 0x0000003753a064a7 in start_thread () from /lib64/libpthread.so.0
      #8 0x00000037532d3c2d in clone () from /lib64/libc.so.6
      Current language: auto; currently c
      (gdb) info locals
      elp = (ALLOC_ELEMENT *) 0x2aaaad55b8f0
      elp_tmp = (ALLOC_ELEMENT *) 0x2aaaad55b8b8
      head = (ALLOC_LAYOUT *) 0x2aaaad55b080
      env = (ENV *) 0x1418f620
      q = (SIZEQ_HEAD *) 0x1418f620
      len = 64
      i = 0 '\0'
      p = (u_int8_t *) 0x2aaaad55b920 <Address 0x2aaaad55b920 out of bounds>
      (gdb) info registers
      rax 0x80000000ffffff9e -9223372032559808610
      rbx 0x2aaaad55b8b8 46912540883128
      rcx 0xffffffffffffffc8 -56
      rdx 0x80000000ffffffd6 -9223372032559808554
      rsi 0x2aaaad55b8f0 46912540883184
      rdi 0x80002aa9ad55b91a -9223325128608859878
      rbp 0x606d1d80 0x606d1d80
      rsp 0x606d1d10 0x606d1d10
      r8 0x606d1ed0 1617764048
      r9 0xffffffff 4294967295
      r10 0x7469736e206e6f69 8388362697781964649
      r11 0x202 514
      r12 0x0 0
      r13 0x5f2d1000 1596788736
      r14 0x0 0
      r15 0x1000 4096
      rip 0x2af382d07885 0x2af382d07885 <__env_alloc_free+407>
      eflags 0x10a93 [ CF AF SF IF OF RF ]
      cs 0x33 51
      ss 0x2b 43
      ds 0x0 0
      es 0x0 0
      fs 0x63 99
      (gdb) x/20i $rip
      *0x2af382d07885 <__env_alloc_free+407>: mov %rax,0x8(%rdi)*
      0x2af382d07889 <__env_alloc_free+411>: mov -0x48(%rbp),%rdx
      0x2af382d0788d <__env_alloc_free+415>: mov -0x48(%rbp),%rax
      0x2af382d07891 <__env_alloc_free+419>: mov 0x8(%rax),%rax
      0x2af382d07895 <__env_alloc_free+423>: lea (%rdx,%rax,1),%rcx
      0x2af382d07899 <__env_alloc_free+427>: mov -0x48(%rbp),%rdx
      0x2af382d0789d <__env_alloc_free+431>: mov -0x48(%rbp),%rax
      0x2af382d078a1 <__env_alloc_free+435>: mov 0x8(%rax),%rax
      0x2af382d078a5 <__env_alloc_free+439>: lea (%rdx,%rax,1),%rax
      0x2af382d078a9 <__env_alloc_free+443>: mov (%rax),%rdx
      0x2af382d078ac <__env_alloc_free+446>: mov -0x48(%rbp),%rax
      0x2af382d078b0 <__env_alloc_free+450>: mov (%rax),%rax
      0x2af382d078b3 <__env_alloc_free+453>: lea (%rdx,%rax,1),%rax
      0x2af382d078b7 <__env_alloc_free+457>: mov %rax,(%rcx)
      0x2af382d078ba <__env_alloc_free+460>:
      jmp 0x2af382d078f8 <__env_alloc_free+522>
      0x2af382d078bc <__env_alloc_free+462>: mov -0x48(%rbp),%rax
      0x2af382d078c0 <__env_alloc_free+466>: mov 0x8(%rax),%rcx
      0x2af382d078c4 <__env_alloc_free+470>: mov -0x48(%rbp),%rdx
      0x2af382d078c8 <__env_alloc_free+474>: mov -0x38(%rbp),%rax
      0x2af382d078cc <__env_alloc_free+478>: mov %rdx,%rbx
      (gdb)

      (gdb) p *env
      $3 = {
      dbenv = 0x1418eeb0,
      mtx_env = 4,
      db_home = 0x12b45890 "/usr/local/bdb/replica_data",
      open_flags = 3987,
      db_mode = 436,
      pid_cache = 3489,
      lockfhp = 0x0,
      env_lref = 0x0,
      recover_dtab = {
      int_dispatch = 0x14187830,
      int_size = 182,
      ext_dispatch = 0x0,
      ext_size = 0
      },
      dir_mode = 0,
      data_len = 100,
      thr_nbucket = 0,
      thr_hashtab = 0x0,
      mutex_iq = 0x0,
      mutex_iq_next = 1,
      mutex_iq_max = 50,
      mtx_dblist = 31,
      ---Type <return> to continue, or q <return> to quit---
      db_ref = 2,
      dblist = {
      tqh_first = 0x2aaab4164dd0,
      tqh_last = 0x2aaab4182c80
      },
      fdlist = {
      tqh_first = 0x14186970,
      tqh_last = 0x2aaab41655a0
      },
      mtx_mt = 32,
      mti = 0,
      mt = 0x0,
      crypto_handle = 0x0,
      lk_handle = 0x14190570,
      lg_handle = 0x14186b00,
      mp_handle = 0x14186a00,
      mutex_handle = 0x12abbf70,
      rep_handle = 0x1418f740,
      tx_handle = 0x1418fed0,
      dbt_usercopy = 0,
      log_verify_wrap = 0x2af382c91a3f <__log_verify_wrap>,
      reginfo = 0x1418fda0,
      test_abort = 0,
      ---Type <return> to continue, or q <return> to quit---
      test_check = 0,
      test_copy = 0,
      flags = 1316
      }


      This is a multithreaded application, and the stacks of the other threads related to the BDB API were:

      (gdb) thread 13
      [Switching to thread 13 (process 3599)]#0 0x00000037532cced2 in select ()
      from /lib64/libc.so.6
      (gdb) bt
      #0 0x00000037532cced2 in select () from /lib64/libc.so.6
      #1 0x00002af382d52aca in __os_sleep (env=0x1418f620, secs=1, usecs=0)
      at ../src/os/os_yield.c:90
      #2 0x00002af382d52a79 in __os_yield (env=0x1418f620, secs=1, usecs=0)
      at ../src/os/os_yield.c:48
      #3 0x00002af382c714ae in __env_rep_enter (env=0x1418f620, checklock=0)
      at ../src/rep/rep_util.c:1022
      #4 0x00002af382d24d9d in __log_archive_pp (dbenv=0x1418eeb0,
      listp=0x610d2710, flags=8) at ../src/log/log_archive.c:60
      #5 0x000000000120836c in clean_unused_log (env=0xfffffffffffffdfe)
      at bdb_operations.cpp:664
      #6 0x0000000001206fda in BDBCheckpointThread::Run (this=0x2aaab0008fc0,
      arg=<value optimized out>) at bdb_environment_handle.cpp:48
      #7 0x0000000000bf868d in osiThread::MyRun (Args=0x0)
      at /vobs/fw/include/osiThread.h:60
      #8 0x0000000000f72dee in threadMain (arg=<value optimized out>)
      at osiThreadLauncherLinux.cpp:50
      #9 0x0000003753a064a7 in start_thread () from /lib64/libpthread.so.0
      #10 0x00000037532d3c2d in clone () from /lib64/libc.so.6
      (gdb)


      (gdb) thread 14
      [Switching to thread 14 (process 3596)]#0 0x0000003753a0ab99 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
      (gdb) bt
      #0 0x0000003753a0ab99 in pthread_cond_wait@@GLIBC_2.3.2 ()
      from /lib64/libpthread.so.0
      #1 0x00002af382c7e143 in __repmgr_queue_get (env=0x1418f620, msgp=0x5fcd10b0,
      th=0x2aaab0008cb0) at ../src/repmgr/repmgr_queue.c:90
      #2 0x00002af382c79d9d in message_loop (env=0x1418f620, th=0x2aaab0008cb0)
      at ../src/repmgr/repmgr_msg.c:49
      #3 0x00002af382c79cd4 in __repmgr_msg_thread (argsp=0x2aaab0008cb0)
      at ../src/repmgr/repmgr_msg.c:34
      #4 0x0000003753a064a7 in start_thread () from /lib64/libpthread.so.0
      #5 0x00000037532d3c2d in clone () from /lib64/libc.so.6



      (gdb) thread 15
      [Switching to thread 15 (process 3594)]#0 0x0000003753a0ab99 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
      (gdb) bt
      #0 0x0000003753a0ab99 in pthread_cond_wait@@GLIBC_2.3.2 ()
      from /lib64/libpthread.so.0
      #1 0x00002af382c7e143 in __repmgr_queue_get (env=0x1418f620, msgp=0x5f2d00b0,
      th=0x2aaab0008b40) at ../src/repmgr/repmgr_queue.c:90
      #2 0x00002af382c79d9d in message_loop (env=0x1418f620, th=0x2aaab0008b40)
      at ../src/repmgr/repmgr_msg.c:49
      #3 0x00002af382c79cd4 in __repmgr_msg_thread (argsp=0x2aaab0008b40)
      at ../src/repmgr/repmgr_msg.c:34
      #4 0x0000003753a064a7 in start_thread () from /lib64/libpthread.so.0
      #5 0x00000037532d3c2d in clone () from /lib64/libc.so.6



      The bug does not always appear, so it is difficult to write a small program to reproduce the case.
      But do you know anything about this issue?
        • 1. Re: segmentation violation in BDB code in macro SH_TAILQ_REMOVE
          524722
          Hi,

          We need more information to be able to determine what might be wrong.
          Can you attempt to reproduce this, even if it doesn't happen 100% of the
          time? When you get it into this state, please post the following information
          from gdb:
          stack trace
          go to frame 1 (rep_grow_sites) and p/x *rep

          You also might consider turning on DB_VERB_REP_ELECT verbose
          messages and posting that as well. Thank you.

          Sue LoVerso
          Oracle
          • 2. Re: segmentation violation in BDB code in macro SH_TAILQ_REMOVE
            524761
            Could you also please answer the following questions about your configuration:

            Do both sites call rep_set_nsites() to set the value to 2 before starting replication?

            Does each site configure the address of the other site by calling repmgr_add_remote_site()?

            Is the master site chosen explicitly by the application, by passing DB_REP_MASTER to repmgr_start() at just one of the sites, instead of by holding an initial election?

            I presume the SEGV occurs only at the one site that is initially not the master site; correct?
            • 3. Re: segmentation violation in BDB code in macro SH_TAILQ_REMOVE
              857786
              Unfortunately, the core file has been removed, so I cannot provide the information you requested...
              • 4. Re: segmentation violation in BDB code in macro SH_TAILQ_REMOVE
                857786
                Q: Do both sites call rep_set_nsites() to set the value to 2 before starting replication?
                ==> Yes


                Q: Does each site configure the address of the other site by calling repmgr_add_remote_site()?
                ==> Yes

                Q: Is the master site chosen explicitly by the application, by passing DB_REP_MASTER to repmgr_start() at just one of the sites, instead of by holding an initial election?
                ===> No, the master is chosen by the initial election

                Q: I presume the SEGV occurs only at the one site that is initially not the master site; correct?
                ===> I'm not sure whether that is the case

                But it's hard to reproduce. How can I reproduce this case so that I can better answer the previous questions in the forum?
                I lost the core...
                • 5. Re: segmentation violation in BDB code in macro SH_TAILQ_REMOVE
                  524722
                  Just to follow up a little. The initial election completes and succeeds and one site becomes the master, correct? Then when you cause the partition, the site that is not the master holds its own election and then that site gets the SEGV?

                  Unfortunately we really need to see the contents of the rep structure, because the segv is occurring when trying to free a pointer to space in that structure. That pointer should either be 0 or a valid address.

                  My suspicion is that the 2nd site that remains a client never participated in the initial election. Maybe your attempts at reproducing this end up having the 2nd site participate too. Perhaps you can increase your chance of reproducing it if you briefly delay bringing that 2nd site online so that the 1st site elects itself master. Then when you disconnect them, the 2nd site will hold its own election for the first time. If you do reproduce it, please save the executable and core file in case there are other things to investigate.

                  Sue LoVerso
                  Oracle
                  • 6. Re: segmentation violation in BDB code in macro SH_TAILQ_REMOVE
                    "Andrei Costache, Oracle-Oracle"
                    This issue is currently being handled in MOS SR 3-3921816865. Once the issue is resolved in the SR, and if the customer agrees, we will post a summary of the resolution here.

                    --Andrei