13 Replies Latest reply on Mar 17, 2011 8:35 PM by 622376

    Heavy IO (fdatasync) with DB_TXN_NOSYNC

    622376
      Hello,

      Updating 1 byte is very slow (BDB 5.1.25), even with DB_TXN_NOSYNC.
      `strace` reports a "write()" and a "fdatasync()" on any "DB->put()".
      The down below testcase produces heavy IO (if not a RAM drive).

      Thanks
      Josef
      #include <time.h>
      #include <stdio.h>
      #include <stdlib.h>
      #include <string.h>
      #include <db.h>
      #include <sys/ipc.h>
      
      #define DATABASE_DIRECTORY      "/home/double/db"
      #define DATABASE_CACHESIZE      (128 * 1024 * 1024)
      #define RECORD_SIZE             0x1000
      
      int main(void)
      {
          printf( "%s\n", db_version(NULL,NULL,NULL) );
          system( "mkdir -v " DATABASE_DIRECTORY " >/dev/null 2>&1" );
          system( "db_recover " DATABASE_DIRECTORY " >/dev/null 2>&1" );
          system( "rm -fv " DATABASE_DIRECTORY "/* >/dev/null 2>&1" );
      
          // Create environment big enough
          DB_ENV *environment;
          int status = 0;
          status |= db_env_create( &environment, 0 );
          environment->set_flags( environment, DB_TIME_NOTGRANTED, true );
          status |= environment->set_lk_max_lockers( environment, 0x20000 );
          status |= environment->set_lk_max_locks( environment, 0x20000 );
          status |= environment->set_lk_max_objects( environment, 0x20000 );
          status |= environment->set_tx_max( environment, 0x1000 );
          status |= environment->set_lk_detect( environment, DB_LOCK_OLDEST );
          status |= environment->set_timeout( environment, 60 * 1000 * 1000, DB_SET_LOCK_TIMEOUT );
          status |= environment->set_timeout( environment, 60 * 1000 * 1000, DB_SET_TXN_TIMEOUT );
          status |= environment->set_thread_count( environment, 0x1000 );
          status |= environment->set_lg_regionmax( environment, 0x1000000 );
          status |= environment->set_lg_bsize( environment, 0x1000000 );
          status |= environment->set_cachesize( environment, 0, DATABASE_CACHESIZE, 0 );
          status |= environment->set_lg_dir( environment, DATABASE_DIRECTORY );
          status |= environment->set_data_dir( environment, DATABASE_DIRECTORY );
          status |= environment->set_tmp_dir( environment, DATABASE_DIRECTORY );
          key_t shm_key = ftok( DATABASE_DIRECTORY, 'D' );
          environment->set_shm_key( environment, shm_key );
          status |= environment->open( environment, DATABASE_DIRECTORY, DB_CREATE | DB_INIT_MPOOL | /*DB_SYSTEM_MEM |*/ DB_INIT_TXN | DB_INIT_LOCK | DB_INIT_LOG, 0644 );
          environment->set_msgfile( environment, stderr );
          environment->set_verbose( environment, DB_VERB_DEADLOCK, 1 );
          environment->set_verbose( environment, DB_VERB_RECOVERY, 1 );
          environment->set_verbose( environment, DB_VERB_REGISTER, 1 );
          environment->set_verbose( environment, DB_VERB_WAITSFOR, 1 );
      
          // Create table with only one record
          DB *table = NULL;
          status |= db_create( &table, environment, 0 );
          status |= table->open( table, NULL, DATABASE_DIRECTORY "/table.db", NULL, DB_BTREE, DB_CREATE | DB_READ_UNCOMMITTED | DB_AUTO_COMMIT, 0644 );
          {
              uint32_t id = 1;
              DBT key;
              memset( &key, 0, sizeof(DBT) );
              key.data = &id;
              key.size = sizeof(id);
              key.ulen = sizeof(id);
              key.flags = DB_DBT_USERMEM;
      
              char buffer[RECORD_SIZE];
              memset( buffer, 0, sizeof(buffer) );
              DBT data;
              memset( &data, 0, sizeof(DBT) );
              data.data = buffer;
              data.size = sizeof(buffer);
              data.ulen = 0;
              data.flags = DB_DBT_USERMEM;
      
              status |= table->put( table, NULL, &key, &data, 0 );
          }
      
          // Update always one byte of that record
          time_t duration = time(NULL);
          uint32_t i;
          for( i=0; i < 1000; ++i )
          {
              // Load record via cursor
              DB_TXN *transaction = NULL;
              status |= environment->txn_begin( environment, NULL, &transaction, DB_TXN_NOSYNC );       // DB_TXN_WRITE_NOSYNC
      
              DBT key;
              memset( &key, 0, sizeof(DBT) );
              uint32_t id = 1;
              key.data = (void*)&id;
              key.size = sizeof(id);
              key.ulen = sizeof(id);
              key.flags = DB_DBT_USERMEM;
      
              DBT data;
              memset( &data, 0, sizeof(DBT) );
      
              /*
              DBC *cursor;
              status |= table->cursor( table, transaction, &cursor, 0 );
              status |= cursor->c_get( cursor, &key, &data, DB_FIRST | DB_RMW );
              */
              status |= table->get( table, transaction, &key, &data, DB_RMW );
      
              // Change record
              int index = rand() % RECORD_SIZE;
              unsigned char buffer[1];
              buffer[0] = rand() % 0x100;
      
              data.data = (void*)buffer;
              data.size = 1;
              data.ulen = 0;
              data.dlen = 1;
              data.doff = index;
              data.flags = DB_DBT_USERMEM | DB_DBT_PARTIAL;
      
              /*
              status |= cursor->c_put( cursor, &key, &data, DB_CURRENT );
              status |= cursor->c_close( cursor );
              */
              status |= table->put( table, transaction, &key, &data, 0 );
      
              // Commit transaction
              status |= transaction->commit( transaction, DB_TXN_NOSYNC );
      
              // Print status...
              printf( "%.3d: record[%.4d] = %.2x\n", i, index, (unsigned)buffer[0] );
          }
          duration = time(NULL) - duration;
          printf( "Writing took: %d:%2.2d seconds\n", (int)duration / 60, (int)duration % 60 );
      
          // Close
          status |= table->close( table, DB_NOSYNC );
          status |= environment->txn_checkpoint( environment, 0, 0, 0 );
          status |= environment->close( environment, 0 );
      
          // Errors?
          if( status != 0 )
              fprintf(stderr,"Errors\n");
      
          return 0;
      }
        • 1. Re: Heavy IO with DB_TXN_NOSYNC
          &quot;Oracle, Sandra Whitman-Oracle&quot;
          Hello,

          I will take a look and try running your test case.

          Thanks,
          Sandra
          • 2. Re: Heavy IO with DB_TXN_NOSYNC
            622376
            Is there a chance to avoid the `fdatasync()`?

            Thanks
            Josef
            • 3. Re: Heavy IO with DB_TXN_NOSYNC
              &quot;Oracle, Sandra Whitman-Oracle&quot;
              Hi Josef,

              Sorry about the delay here, I will plan to take a closer look today.

              Thank you,
              Sandra
              • 4. Re: Heavy IO with DB_TXN_NOSYNC
                &quot;Oracle, Sandra Whitman-Oracle&quot;
                Hi Josef,

                I ran your test case on RHEL with BDB 5.1.25 and specifically
                looked at the results of calling fdatasync() from the put()
                method. The test case opens the database with
                "DB_AUTO_COMMIT" and then first there is a single put within
                the {} i.e.
                status |= table->put( table, NULL, &key, &data, 0 );

                When I changed the test to open the environment with:
                status = environment->set_flags( environment, DB_TXN_NOSYNC, 1 );

                fdatasync() is not called to synchronously flush the log.


                In the loop where a transaction is explicitly committed, creating
                the transaction with either DB_TXN_NOSYNC or DB_TXN_WRITE_NOSYNC as:
                status |= environment->txn_begin( environment, NULL, &transaction, DB_TXN_NOSYNC );
                status |= environment->txn_begin( environment, NULL, &transaction, DB_TXN_WRITE_NOSYNC );

                did not invoke call into fdatasync().


                Of course in neither case will the transaction be durable.

                Did you get different results? Please let me know if you
                think I misunderstand the problem reported or if I did not do
                something as expected with the test case.

                Thank you,
                Sandra
                • 5. Re: Heavy IO with DB_TXN_NOSYNC
                  &quot;Oracle, Sandra Whitman-Oracle&quot;
                  Ok,
                  I think I see the problem. It is when there is a get. Let me take another look.

                  Sorry about that,
                  Sandra
                  • 6. Re: Heavy IO with DB_TXN_NOSYNC
                    &quot;Oracle, Sandra Whitman-Oracle&quot;
                    Hi Josef,

                    So what is happening in this test is that with DB_DBT_PARTIAL
                    one byte is being replaced during each put i.e.
                    // Change record
                    index = rand() % RECORD_SIZE;
                    buffer[0] = rand() % 0x100;

                    data.data = (void*)buffer;
                    data.size = 1;
                    data.ulen = 0;
                    data.dlen = 1;
                    data.doff = index;
                    data.flags = DB_DBT_USERMEM | DB_DBT_PARTIAL ;

                    If we are about to truncate a page from the database
                    (return it to the filesystem) we need to flush the log
                    first. In this case the record being updated is on an
                    overflow page which also happens to be at the end of the
                    file, i.e. from db_dump table.db we can see:

                    page 1: btree leaf: LSN [1][15238]: level 1
                    prev: 0 next: 0 entries: 2 offset: 4076
                    [000] 4088 len: 4 data: 01000000
                    [001] 4076 overflow: total len: 1934 page: 2
                    page 2: overflow: LSN [1][13228]: level 0
                    prev: 0 next: 0 ref cnt: 1 len: 1934 data: 000000000000...


                    This record is getting freed by the put. Since it is at
                    the end of the file, the file gets truncated, and hence the
                    log gets flushed first. Repeated updating the record,
                    without adding pages to the database will cause this truncation
                    and fdatasync() to happen on each put. I missed this in
                    my previous update because I only ran one iteration of
                    the loop.


                    The stack leading to the fdatasync() looks like:
                    #0 fdatasync ()
                    #1 __os_fsync ../src/os/os_fsync.c:92
                    #2 __log_flush_int ../src/log/log_put.c:1087
                    #3 __log_flush_commit ../src/log/log_put.c:503
                    #4 __log_put ../src/log/log_put.c:302
                    #5 __log_put_record_int ../src/log/log_put.c:1967
                    #6 __log_put_record ../src/log/log_put.c:1708
                    #7 __db_pg_free_log ../src/dbinc_auto/db_auto.h:343
                    #8 __db_free ../src/db/db_meta.c:495
                    #9 __db_doff ../src/db/db_overflow.c:479
                    #10 __bam_ditem ../src/btree/bt_delete.c:138
                    #11 __bam_iitem ../src/btree/bt_put.c:421
                    #12 __bamc_put ../src/btree/bt_cursor.c:2240
                    #13 __dbc_iput ../src/db/db_cam.c:2121
                    #14 __dbc_put ../src/db/db_cam.c:2034
                    #15 __db_put ../src/db/db_am.c:520
                    #16 __db_put_pp ../src/db/db_iface.c:1594
                    #17 main () at ./test.c


                    Thanks,
                    Sandra
                    • 7. Re: Heavy IO with DB_TXN_NOSYNC
                      622376
                      Hi Sandra,

                      Thank you very much for your answer.
                      Yes, you are right. In the down below testcase creating is slow, updating
                      is very fast. There is nothing I can do to avoid this `fdatasync()`?

                      Thanks
                      Josef
                      #include <time.h>
                      #include <stdio.h>
                      #include <stdlib.h>
                      #include <string.h>
                      #include <db.h>
                      #include <sys/ipc.h>
                      
                      #define DATABASE_DIRECTORY      "/home/double/db"
                      #define DATABASE_CACHESIZE      (128 * 1024 * 1024)
                      #define RECORD_SIZE             0x1000
                      
                      int main(void)
                      {
                          uint32_t id, i;
                      
                          printf( "%s\n", db_version(NULL,NULL,NULL) );
                          system( "mkdir -v " DATABASE_DIRECTORY " >/dev/null 2>&1" );
                          system( "db_recover " DATABASE_DIRECTORY " >/dev/null 2>&1" );
                          system( "rm -fv " DATABASE_DIRECTORY "/* >/dev/null 2>&1" );
                      
                          // Create environment big enough
                          DB_ENV *environment;
                          int status = 0;
                          status |= db_env_create( &environment, 0 );
                          environment->set_flags( environment, DB_TIME_NOTGRANTED, true );
                          status |= environment->set_lk_max_lockers( environment, 0x20000 );
                          status |= environment->set_lk_max_locks( environment, 0x20000 );
                          status |= environment->set_lk_max_objects( environment, 0x20000 );
                          status |= environment->set_tx_max( environment, 0x1000 );
                          status |= environment->set_lk_detect( environment, DB_LOCK_OLDEST );
                          status |= environment->set_timeout( environment, 60 * 1000 * 1000, DB_SET_LOCK_TIMEOUT );
                          status |= environment->set_timeout( environment, 60 * 1000 * 1000, DB_SET_TXN_TIMEOUT );
                          status |= environment->set_thread_count( environment, 0x1000 );
                          status |= environment->set_lg_regionmax( environment, 0x1000000 );
                          status |= environment->set_lg_bsize( environment, 0x1000000 );
                          status |= environment->set_cachesize( environment, 0, DATABASE_CACHESIZE, 0 );
                          status |= environment->set_lg_dir( environment, DATABASE_DIRECTORY );
                          status |= environment->set_data_dir( environment, DATABASE_DIRECTORY );
                          status |= environment->set_tmp_dir( environment, DATABASE_DIRECTORY );
                          key_t shm_key = ftok( DATABASE_DIRECTORY, 'D' );
                          environment->set_shm_key( environment, shm_key );
                          status |= environment->open( environment, DATABASE_DIRECTORY, DB_CREATE | DB_INIT_MPOOL | /*DB_SYSTEM_MEM |*/ DB_INIT_TXN | DB_INIT_LOCK | DB_INIT_LOG, 0644 );
                          environment->set_msgfile( environment, stderr );
                          environment->set_verbose( environment, DB_VERB_DEADLOCK, 1 );
                          environment->set_verbose( environment, DB_VERB_RECOVERY, 1 );
                          environment->set_verbose( environment, DB_VERB_REGISTER, 1 );
                          environment->set_verbose( environment, DB_VERB_WAITSFOR, 1 );
                      
                          // Create table with 1000 records
                          DB *table = NULL;
                          status |= db_create( &table, environment, 0 );
                          status |= table->open( table, NULL, DATABASE_DIRECTORY "/table.db", NULL, DB_BTREE, DB_CREATE | DB_READ_UNCOMMITTED | DB_AUTO_COMMIT, 0644 );
                          for( id=0; id < 1000; ++id )
                          {
                              DBT key;
                              memset( &key, 0, sizeof(DBT) );
                              key.data = &id;
                              key.size = sizeof(id);
                              key.ulen = sizeof(id);
                              key.flags = DB_DBT_USERMEM;
                      
                              char buffer[RECORD_SIZE];
                              memset( buffer, 0, sizeof(buffer) );
                              for( i=0; i < RECORD_SIZE; ++i )
                                  buffer[i] = rand() % 0x100;
                              DBT data;
                              memset( &data, 0, sizeof(DBT) );
                              data.data = buffer;
                              data.size = sizeof(buffer);
                              data.ulen = 0;
                              data.flags = DB_DBT_USERMEM;
                      
                              status |= table->put( table, NULL, &key, &data, 0 );
                      
                              // Print status...
                              printf( "Create %.3d\n", id );
                          }
                      
                          // Update this 1000 records
                          time_t duration = time(NULL);
                          for( id=0; id < 1000; ++id )
                          {
                              // Load record via cursor
                              DB_TXN *transaction = NULL;
                              status |= environment->txn_begin( environment, NULL, &transaction, DB_TXN_NOSYNC );       // DB_TXN_WRITE_NOSYNC
                      
                              DBT key;
                              memset( &key, 0, sizeof(DBT) );
                              key.data = (void*)&id;
                              key.size = sizeof(id);
                              key.ulen = sizeof(id);
                              key.flags = DB_DBT_USERMEM;
                      
                              DBT data;
                              memset( &data, 0, sizeof(DBT) );
                      
                              /*
                              DBC *cursor;
                              status |= table->cursor( table, transaction, &cursor, 0 );
                              status |= cursor->c_get( cursor, &key, &data, DB_FIRST | DB_RMW );
                              */
                              status |= table->get( table, transaction, &key, &data, DB_RMW );
                      
                              // Change record
                              for( i=0; i < data.size; ++i )
                                  ((char*)data.data) = rand() % 0x100;

                      /*
                      status |= cursor->c_put( cursor, &key, &data, DB_CURRENT );
                      status |= cursor->c_close( cursor );
                      */
                      status |= table->put( table, transaction, &key, &data, 0 );

                      // Commit transaction
                      status |= transaction->commit( transaction, DB_TXN_NOSYNC );

                      // Print status...
                      printf( "Update %.3d\n", id );
                      }
                      duration = time(NULL) - duration;
                      printf( "Writing took: %d:%2.2d seconds\n", (int)duration / 60, (int)duration % 60 );

                      // Close
                      status |= table->close( table, DB_NOSYNC );
                      status |= environment->txn_checkpoint( environment, 0, 0, 0 );
                      status |= environment->close( environment, 0 );

                      // Errors?
                      if( status != 0 )
                      fprintf(stderr,"Errors\n");

                      return 0;
                      }
                      • 8. Re: Heavy IO with DB_TXN_NOSYNC
                        &quot;Oracle, Sandra Whitman-Oracle&quot;
                        Hi Josef,

                        I just looked at the second program posted. This is a
                        different scenario than what was reported originally.

                        The database open and checkpoint methods do issue
                        calls to fdatasync().

                        However, invoking the put() with an explicit transaction is not calling
                        into fdatasync(). In other words I am not seeing the put() call below syncing
                        the database with a call to fdatasync():

                        status |= environment->txn_begin( environment, NULL, &transaction, DB_TXN_NOSYNC );
                        ...
                        status |= table->put( table, transaction, &key, &data, 0 );
                        ...
                        status |= transaction->commit( transaction, DB_TXN_NOSYNC );


                        In the first case i.e. the case of invoking put() with an implicit
                        transaction created with the DB_AUTO_COMMIT flag fdatasync()
                        calls are made. This can be avoided by setting DB_TXN_NOSYN
                        in the environment like:
                        status = environment->set_flags( environment, DB_TXN_NOSYNC, 1 );


                        Otherwise there is one call to fdatasync() and that is when a
                        new log file is created because the previous one if full. In this
                        case the old log file needs to be flushed and closed out. That
                        fdatasync() can not be avoided.


                        Thank you,
                        Sandra
                        • 9. Re: Heavy IO with DB_TXN_NOSYNC
                          622376
                          Hi Sandra,

                          Yes sorry, the last testcase was a mistake.

                          In my application I always update the last record (fixed size) of a database.
                          Is there anything I can do to avoid an `fdatasync()`?

                          I can't use DB_TXN_NOT_DURABLE, because this flag will damage the
                          database, if an application error occours.

                          Thanks a lot
                          Josef
                          • 10. Re: Heavy IO with DB_TXN_NOSYNC
                            &quot;Oracle, Sandra Whitman-Oracle&quot;
                            Hi Josef,

                            That is the first case your sample code illustrated.
                            As discussed if we are about to truncate a page from the
                            database (return it to the filesystem) we need to flush
                            the log first. If the record is at the end of the file,
                            the file gets truncated, and hence the log gets flushed.
                            Repeated updating the record, without adding pages to
                            the database will cause this truncation and fdatasync()
                            to happen on each put.

                            Thanks,
                            Sandra
                            • 11. Re: Heavy IO with DB_TXN_NOSYNC
                              622376
                              Hi Sandra,

                              Thanks a lot for your answer.
                              Are there any plans to workaround this issue in a future release?
                              Or a flag to prevent this truncate would be great.

                              Thanks
                              Josef
                              • 12. Re: Heavy IO with DB_TXN_NOSYNC
                                &quot;Oracle, Sandra Whitman-Oracle&quot;
                                Hi Josef,

                                In general we cannot avoid flushing the log on a file truncate.
                                The problem is that since the filesystem operation happens
                                immediately we need to have the log on disk prior to truncating
                                the file or else the write ahead protocol is broken.
                                The internal SR #19725 was filed to investigate if something
                                can be done in the particular case raised.

                                Thanks,
                                Sandra
                                • 13. Re: Heavy IO (fdatasync) with DB_TXN_NOSYNC
                                  622376
                                  Thanks a lot!
                                  Josef