1 2 Previous Next 18 Replies Latest reply: Nov 28, 2012 10:18 AM by user500302-Oracle RSS

    SQL Causes server to swap and become unresponsive, What is it doing?

    Tom 55
      SQL Causes server to swap and become unresponsive does it have the same effect with other users??

      *****************
      WARNING - script causes my servers to not respond, requiring reboot after 30 mins
      Only run on a server/vm where a bounce is ok
      *****************

      The script creates a table
      Runs an anonymous block to put 160mb in a single clob
      Runs a simple select with some REGEXP_REPLACE and REGEXP_SUBSTR


      This has been raised with Oracle support.
      Their conclusion is lots of single block reads are causing the server to hang.

      I do not think this is the reason and would like someone else to run it, to see what they think!


      Environment

      on red hat 5.7 and Oracle 11.2.0.3 enterprise edition it seems to hang the server
      [oracle@ ~]$ uname -a
      Linux  2.6.18-274.el5xen #1 SMP Fri Jul 8 17:45:44 EDT 2011 x86_64 x86_64 x86_64 GNU/Linux
      [oracle@edu-db3 ~]$ cat /etc/redhat-release
      Red Hat Enterprise Linux Server release 5.7 (Tikanga)
      on redhat 4.5 Oracle 10.2.0.4 - it seems to just kill the instance
      [oracle@ ~]$ uname -a
      Linux  2.6.9-55.ELxenU #1 SMP Fri Apr 20 16:56:53 EDT 2007 x86_64 x86_64 x86_64 GNU/Linux
      [oracle@edu-db1 ~]$ cat /etc/redhat-release
      Red Hat Enterprise Linux AS release 4 (Nahant Update 5)
      When running the sql oracle consumes all the memory on the server, although SGA and PGA configured well bellow os memory
      SQL> show parameter sga
      NAME TYPE VALUE
      lock_sga boolean FALSE
      pre_page_sga boolean FALSE
      sga_max_size big integer 5G
      sga_target big integer 5G
      
      SQL> show parameter pga
      NAME TYPE VALUE
      pga_aggregate_target big integer 2G
      
      [oracle@ scripts]$ free
      total used free shared buffers cached
      Mem: 8388608 7346232 1042376 0 70060 6498460
      -/+ buffers/cache: 777712 7610896
      Swap: 4161528 1212380 2949148
      Script Output
      SQL> @kill_server.sql
      
      Table created.
      To start with we see oracle using the 5g sga
      PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
      13007 oracle 25 0 5352m 216m 212m R 95.9 2.6 0:19.33 oracle
      163mb clob created
      MB
      163
      SQL Running
      Oracle process using 13gb of memory!!!!!!!!!!!!!
      PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
      13007 oracle 18 0 13.4g 6.5g 210m D 7.3 81.8 0:55.42 oracle
      But PGA has not grown
      SQL> select * from v$pgastat;
      
      NAME VALUE UNIT
      aggregate PGA target parameter 2147483648 bytes
      aggregate PGA auto target 1767444480 bytes
      global memory bound 214743040 bytes
      total PGA inuse 183654400 bytes
      total PGA allocated 248871936 bytes
      maximum PGA allocated 522175488 bytes
      -----------------------------------------------
      -- SCRIPT
      -- WARNING (again) - script causes my servers to not respond, requiring reboot after 30 mins
      -- Only run on a server/vm that does not matter!
      -----------------------------------------------
      
      DROP TABLE testing.clob_store PURGE;
      --
      -- Clob holding table
      --
      CREATE TABLE
        testing.clob_store
      (
        clob_data CLOB
      )
        LOB ("CLOB_DATA") STORE AS BASICFILE (TABLESPACE "TBSCLOB" DISABLE STORAGE IN ROW CHUNK 8192 RETENTION CACHE)
      /
      --
      -- Make a 100mb test clob
      --
      DECLARE
        g_clob  CLOB := EMPTY_CLOB;
        PROCEDURE clob_append(p_line VARCHAR2) IS
          l_line VARCHAR2(2000) := p_line || CHR(10);
        BEGIN
          DBMS_LOB.WRITEAPPEND( g_clob, LENGTH(l_line), l_line );
        END;
      BEGIN
        DBMS_LOB.CREATETEMPORARY(g_clob,TRUE);
        DBMS_LOB.OPEN( g_clob, DBMS_LOB.LOB_READWRITE );
      
      
        clob_append( '<Workbook>' );
      
        clob_append( '<Styles>' );
        FOR i IN 1..1000000 LOOP
          clob_append(' <Styles><Style ss:ID="Default" ss:Name="Normal"><Alignment ss:Vertical="Bottom"/><Borders/><Font/><Interior/><NumberFormat/><Protection/></Style>');
        END LOOP;
        clob_append( '</Styles>' );
      
        clob_append( '<Workbook>' );
        FOR i IN 1..1000000 LOOP
          clob_append('<dummy>sfsdfsdf</dummy>');
        END LOOP;
        clob_append( '</Worksheet>' );
      
        clob_append( '</Workbook>' );
      
        INSERT INTO testing.clob_store(clob_data) VALUES (g_clob);
      
        DBMS_LOB.CLOSE(g_clob);
        DBMS_LOB.FREETEMPORARY(g_clob);
        COMMIT;
      END;
      /
      --
      -- Check sizes
      --
      select
        round(DBMS_LOB.GETLENGTH (clob_data)/1024/1024) mb
      from
        testing.clob_store
      /
      --
      -- Test lob set up
      -- no trace and run simple select
      --
      column rd new_value run_date noprint
      select to_char(sysdate, 'YYYY_MM_DD__HH_MI') rd from dual
      /
      alter session set tracefile_identifier = 'bad_sql_&run_date'
      /
      alter session set events '10046 trace name context forever, level 12'
      /
      --
      -- Some regexp extracts
      --
      SELECT
        REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_SUBSTR( RPO.CLOB_DATA , ' <Styles>.*</Styles>.', 1, 1, 'n' ) , ' </?Styles>.', NULL, 1, 0, 'n' ) , '  <Style ss:ID="Default".*?  </Style>.', NULL, 1, 0, 'n' ) STYLES
      , REGEXP_SUBSTR(RPO.CLOB_DATA, ' <Workbook.*</Workbook>.', 1, 1, 'n') worksheet
      FROM
        testing.clob_store rpo
      /
      Extra info
      SQL Trace
      
      *** 2012-11-12 10:22:03.936
      WAIT #47954520916768: nam='db file sequential read' ela= 28421 file#=12 block#=1324297 blocks=1 obj#=567174 tim=1352715723936618
      WAIT #47954520916768: nam='db file sequential read' ela= 34490 file#=12 block#=1324345 blocks=1 obj#=567174 tim=1352715724000358
      WAIT #47954520916768: nam='db file sequential read' ela= 59165 file#=12 block#=1324298 blocks=1 obj#=567174 tim=1352715724314041
      WAIT #47954520916768: nam='db file sequential read' ela= 68224 file#=12 block#=1324346 blocks=1 obj#=567174 tim=1352715724395729
      
      *** 2012-11-12 10:22:04.631
      WAIT #47954520916768: nam='db file sequential read' ela= 34901 file#=12 block#=1324299 blocks=1 obj#=567174 tim=1352715724631454
      WAIT #47954520916768: nam='db file sequential read' ela= 32349 file#=12 block#=1324347 blocks=1 obj#=567174 tim=1352715724701167
      WAIT #47954520916768: nam='db file sequential read' ela= 109175 file#=12 block#=1324300 blocks=1 obj#=567174 tim=1352715725093610
      WAIT #47954520916768: nam='db file sequential read' ela= 69001 file#=12 block#=1324348 blocks=1 obj#=567174 tim=1352715725194967
      
      *** 2012-11-12 10:22:05.913
      WAIT #47954520916768: nam='db file sequential read' ela= 23154 file#=12 block#=1324301 blocks=1 obj#=567174 tim=1352715725913197
      WAIT #47954520916768: nam='db file sequential read' ela= 31894 file#=12 block#=1324349 blocks=1 obj#=567174 tim=1352715726090547
      WAIT #47954520916768: nam='db file sequential read' ela= 29719 file#=12 block#=1324302 blocks=1 obj#=567174 tim=1352715726457981
      WAIT #47954520916768: nam='db file sequential read' ela= 29533 file#=12 block#=1324350 blocks=1 obj#=567174 tim=1352715726593580
      
      *** 2012-11-12 10:22:07.096
      WAIT #47954520916768: nam='db file sequential read' ela= 121833 file#=12 block#=1324035 blocks=1 obj#=567174 tim=1352715727096799
      WAIT #47954520916768: nam='db file sequential read' ela= 56658 file#=12 block#=1324131 blocks=1 obj#=567174 tim=1352715727322665
      STRACE
      -rw-r--r-- 1 root root 130647 Nov 12 10:29 strace_oracle.log
      7 minutes after last oracle trace entry!!
      brk(0x60013000)                         = 0x5ffef000
      mmap(NULL, 1048576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2b9e772b5000
      brk(0x60013000)                         = 0x5ffef000
      mmap(NULL, 1048576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2b9e773b5000
      brk(0x60013000)                         = 0x5ffef000
      mmap(NULL, 1048576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2b9e774b5000
      brk(0x60013000)                         = 0x5ffef000
      mmap(NULL, 1048576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2b9e775b5000
      brk(0x60013000)                         = 0x5ffef000
      mmap(NULL, 1048576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2b9e776b5000
      Edited by: Tom 55 on Nov 13, 2012 6:39 AM
        • 1. Re: SQL Causes server to swap and become unresponsive
          12cdb
          Welcome to Oracle forum.

          You copy/paste ADDM report..

          I think you have missed the question ? What do you expect from the post ?
          • 2. Re: SQL Causes server to swap and become unresponsive
            Osama_Mustafa
            Welcome to the Forum .
            Could you please format your thread .
            • 3. Re: SQL Causes server to swap and become unresponsive
              Tom 55
              Hi 11g,

              Sorry for a vague question.

              Question is 'does this affect other setups?'

              Oracle support seem to think is just a trivial slow single/multi block read problem.
              I think its much more interesting than that!

              Thanks,
              Tom
              • 4. Re: SQL Causes server to swap and become unresponsive
                12cdb
                Oracle support seem to think is just a trivial slow single/multi block read problem.
                if they said so, I think you can safely ignore.
                • 5. Re: SQL Causes server to swap and become unresponsive
                  Tom 55
                  But the oracle 10046 trace finishes 7 minutes before the OS strace of the OS process.

                  So my question is what is it doing?
                  Why does it keep asking for more and more memory?

                  Thanks,
                  Tom
                  • 6. Re: SQL Causes server to swap and become unresponsive
                    Tom 55
                    Hi 11g,

                    Sql crashes instance so can't run ADDM for that period.

                    I do run a 10046 trace, strace, monitor v$pgastat and top while it is running

                    This shows the oracle process consuming all the memory over 2 twice sga size, even though pga max size is low?
                    I can't see how this is possible?


                    We have a similar SQL in our production server, when the SQL tuning advisor is enabled, then it is run every night causes OS to use all the swap and the server reboots, when process load is about 100.

                    Thanks,
                    Tom
                    • 7. Re: SQL Causes server to swap and become unresponsive
                      Balazs Papp
                      Question is 'does this affect other setups?'
                      yes it does, just ran the script you provided on one of my virtual machines

                      Oracle Linux 6.3, 11.2.0.3.4 EE
                      memory_target=1G,memory_max_target=1G
                      2 CPUs, 4GB physical memory, 4GB swap

                      the oracle process slowly consumed all the memory (physical and swap) in about 20 minutes

                      top shows the process has a 6209 MB virtual and 2.8 GB resident size
                       7294 oracle    20   0 6209m 2.8g 112m D  1.4 72.1   1:12.04 oracle
                      server became nonresponsive to new connections (existing ssh sessions still work though)
                      server load is around 50-80, havent crashed so far
                      • 8. Re: SQL Causes server to swap and become unresponsive
                        Tom 55
                        Thank you for running it Balazs,

                        Any idea what it is doing?

                        I don't think it is block reads :)

                        Tom
                        • 9. Re: SQL Causes server to swap and become unresponsive
                          Dom Brooks
                          I haven't got a database to hand that I can crash.

                          If it's related to PGA usage then PGA will consume what it needs to consume regardless of PGA aggregate target settings etc and regardless of whether it is either developer abuse of PGA or memory leaks.

                          Often the best way to look at this sort of issue is from OS upwards usign oradebug, pstack, etc.


                          Also see
                          http://oracle-randolf.blogspot.co.uk/2010/05/advanced-oracle-troubleshooting-session.html
                          • 10. Re: SQL Causes server to swap and become unresponsive, What is it doing?
                            sb92075
                            http://docs.oracle.com/cd/E11882_01/server.112/e25513/initparams141.htm#REFRN10285

                            "Total memory usage can grow beyond the value of MEMORY_TARGET. For example, memory is allocated to PL/SQL tables and varrays regardless of the value of MEMORY_TARGET as long as memory is available at the operating system level."
                            • 11. Re: SQL Causes server to swap and become unresponsive
                              Tom 55
                              Thank you Dom,

                              When the PGA grows over the target value, I can normally see this v$pgastat.
                              But pgamaxsize stays small while the select is running, while I can still check.

                              Does that mean maxpgasize is not updated?

                              Thanks,
                              Tom
                              • 12. Re: SQL Causes server to swap and become unresponsive
                                jgarry
                                It might mean the problem is UGA memory management. There appear to be a number of bugs with the way XML and CLOB's interact, perhaps your demo is showing another aspect of that. See http://oracle-randolf.blogspot.com/2010/05/advanced-oracle-troubleshooting-session.html and look at the bugs having to do with clob memory leaks on MOS for some clues on how to investigate this further.
                                • 13. Re: SQL Causes server to swap and become unresponsive, What is it doing?
                                  Iggy Fernandez
                                  I noticed that the brk system calls are failing because the return value is less than the argument value. That may be part of the puzzle.

                                  brk(0x60013000) = 0x5ffef000

                                  http://www.kernel.org/doc/man-pages/online/pages/man2/brk.2.html ... brk() sets the end of the data segment to the value specified by addr, when that value is reasonable, the system has enough memory, and the process does not exceed its maximum data size (see setrlimit(2)) ... On failure, the system call returns the current break.

                                  http://www.kernel.org/doc/man-pages/online/pages/man2/setrlimit.2.html ... RLIMIT_DATA The maximum size of the process's data segment (initialized data, uninitialized data, and heap). This limit affects calls to brk(2)

                                  0x60013000 is 1,610,690,560 in decimal
                                  0x5ffef000 is 1,610,543,104 in decimal

                                  Best of luck in solving this.

                                  Iggy
                                  • 14. Re: SQL Causes server to swap and become unresponsive, What is it doing?
                                    Tom 55
                                    Thanks Iggy, now I know the strace show the os is really out of memory.

                                    Some more Debug, using Tanels tools...
                                    SQL> @snapper stats 5 1 13
                                    Sampling SID 13 with interval 5 seconds, taking 1 snapshots...
                                    
                                    -- Session Snapper v3.52 by Tanel Poder @ E2SN ( http://tech.e2sn.com )
                                    
                                    -------------------------------------------------------------------------------------------------------------------------------------
                                        SID, USERNAME  , TYPE, STATISTIC                                                 ,     HDELTA, HDELTA/SEC,    %TIME, GRAPH
                                    -------------------------------------------------------------------------------------------------------------------------------------
                                         13, TESTING   , STAT, session logical reads                                     ,      1.93k,      386.6,
                                         13, TESTING   , STAT, consistent gets                                           ,      1.93k,      386.6,
                                         13, TESTING   , STAT, consistent gets from cache                                ,      1.93k,      386.6,
                                         13, TESTING   , STAT, consistent gets from cache (fastpath)                     ,        277,       55.4,
                                         13, TESTING   , STAT, consistent gets - examination                             ,       1.1k,      220.8,
                                         13, TESTING   , STAT, logical read bytes from cache                             ,     15.84M,      3.17M,
                                         13, TESTING   , STAT, shared hash latch upgrades - no wait                      ,        552,      110.4,
                                         13, TESTING   , STAT, calls to get snapshot scn: kcmgss                         ,        276,       55.2,
                                         13, TESTING   , STAT, index crx upgrade (positioned)                            ,        552,      110.4,
                                         13, TESTING   , STAT, lob reads                                                 ,        276,       55.2,
                                         13, TESTING   , STAT, index fetch by key                                        ,        276,       55.2,
                                         13, TESTING   , STAT, index scans kdiixs1                                       ,        552,      110.4,
                                    --  End of Stats snap 1, end=2012-11-13 10:54:06, seconds=5
                                    
                                    SQL> @snapper stats 5 1 13
                                    HANG!

                                    So snapper not showing anything?
                                    Over to ostackprof
                                    SQL> @ostackprof 1143 0 5
                                    
                                    Sampling...
                                    
                                    Below is the stack prefix common to all samples:
                                    ------------------------------------------------------------------------
                                    Frame->function()
                                    ------------------------------------------------------------------------
                                    # 34 ->__libc_start_main()
                                    # 33  ->main()
                                    # 32   ->ssthrdmain()
                                    # 31    ->opimai_real()
                                    # 30     ->sou2o()
                                    # 29      ->opidrv()
                                    # 28       ->opiodr()
                                    # 27        ->opiino()
                                    # 26         ->opitsk()
                                    # 25          ->ttcpip()
                                    # 24           ->opiodr()
                                    # 23            ->opifch()
                                    # 22             ->opifch2()
                                    # 21              ->qerstFetch()
                                    # 20               ->qertbFetch()
                                    # 19                ->qerstRowP()
                                    # 18                 ->kpofcr()
                                    # 17                  ->evaopn2()
                                    # 16                   ->evaopn2()
                                    # 15                    ->evaopn2()
                                    # 14                     ->kokle_rxsubstr()
                                    # 13                      ->kole_rxsubstr()
                                    # 12                       ->lxkRegexpSubstrLobNSub()
                                    # 11                        ->lxregexec()
                                    # 10                         ->lxregmatch()
                                    #  ...(see call profile below)
                                    #
                                    # -#--------------------------------------------------------------------
                                    # - Num.Samples -> in call stack()
                                    # ----------------------------------------------------------------------
                                         2 ->lxregmatgpt()->kole_rxrdcb()->koklc_read()->koklread()->koklOutlineRead1()->kdlf_read()->kdl_read1()->kdlprl()->__intel_new_memcpy()->__sighandler()->->
                                         2 ->__sighandler()->->
                                         1 ->lxregmatpush()->__sighandler()->->
                                    Ok we see the regex function running
                                    Server starting to overload now
                                    SQL> @ostackprof 1143 0 5
                                    
                                    Sampling...
                                    
                                    Below is the stack prefix common to all samples:
                                    ------------------------------------------------------------------------
                                    Frame->function()
                                    ------------------------------------------------------------------------
                                    #  ...(see call profile below)
                                    #
                                    # -#--------------------------------------------------------------------
                                    # - Num.Samples -> in call stack()
                                    # ----------------------------------------------------------------------
                                         2 ->__libc_start_main()->main()->ssthrdmain()->opimai_real()->sou2o()->opidrv()->opiodr()->opiino()->opitsk()->ttcpip()->opiodr()->opifch()->opifch2()->qerstFetch()->qertbFetch()->qerstRowP()->kpofcr()->evaopn2()->evaopn2()->evaopn2()->kokle_rxsubstr()->kole_rxsubstr()->lxkRegexpSubstrLobNSub()->lxregexec()->lxregmatch()->lxregmatgpt()->kole_rxrdcb()->koklc_read()->koklread()->koklOutlineRead1()->kdlf_read()->kdl_read1()->kdlprl()->kdlrdb()->kcbgtcr()->__sighandler()->->
                                    
                                         1 ->__sighandler()->->
                                         1 ->__libc_start_main()->main()->ssthrdmain()->opimai_real()->sou2o()->opidrv()->opiodr()->opiino()->opitsk()->ttcpip()->opiodr()->opifch()->opifch2()->qerstFetch()->qertbFetch()->qerstRowP()->kpofcr()->evaopn2()->evaopn2()->evaopn2()->kokle_rxsubstr()->kole_rxsubstr()->lxkRegexpSubstrLobNSub()->lxregexec()->lxregmatch()->lxregmatgpt()->kole_rxrdcb()->koklc_read()->koklread()->koklOutlineRead1()->kdlf_read()->kdl_read1()->kdlprl()->__intel_new_memcpy()->__sighandler()->->
                                         1 ->__libc_start_main()->main()->ssthrdmain()->opimai_real()->sou2o()->opidrv()->opiodr()->opiino()->opitsk()->ttcpip()->opiodr()->opifch()->opifch2()->qerstFetch()->qertbFetch()->qerstRowP()->kpofcr()->evaopn2()->evaopn2()->evaopn2()->kokle_rxsubstr()->kole_rxsubstr()->lxkRegexpSubstrLobNSub()->lxregexec()->lxregmatch()->lxregmatgpt()->__sighandler()->->
                                    
                                    SQL> @ostackprof 1143 0 5
                                    
                                    Sampling...
                                    
                                    Below is the stack prefix common to all samples:
                                    ------------------------------------------------------------------------
                                    Frame->function()
                                    ------------------------------------------------------------------------
                                    #  ...(see call profile below)
                                    #
                                    # -#--------------------------------------------------------------------
                                    # - Num.Samples -> in call stack()
                                    # ----------------------------------------------------------------------
                                         2 ->__sighandler()->->
                                         2 ->__libc_start_main()->main()->ssthrdmain()->opimai_real()->sou2o()->opidrv()->opiodr()->opiino()->opitsk()->ttcpip()->opiodr()->opifch()->opifch2()->qerstFetch()->qertbFetch()->qerstRowP()->kpofcr()->evaopn2()->evaopn2()->evaopn2()->kokle_rxsubstr()->kole_rxsubstr()->lxkRegexpSubstrLobNSub()->lxregexec()->lxregmatch()->lxregmatgpt()->kole_rxrdcb()->koklc_read()->koklread()->koklOutlineRead1()->kdlf_read()->kdl_read1()->kdlprl()->kdlrdb()->kcbgtcr()->__sighandler()->->
                                         1 ->__libc_start_main()->main()->ssthrdmain()->opimai_real()->sou2o()->opidrv()->opiodr()->opiino()->opitsk()->ttcpip()->opiodr()->opifch()->opifch2()->qerstFetch()->qertbFetch()->qerstRowP()->kpofcr()->evaopn2()->evaopn2()->evaopn2()->kokle_rxsubstr()->kole_rxsubstr()->lxkRegexpSubstrLobNSub()->lxregexec()->lxregmatch()->lxregmatpush()->__sighandler()->->
                                    
                                    SQL> @ostackprof 1143 0 5
                                    
                                    Hit CTRL+C to cancel, ENTER to continue...
                                    HANG!


                                    Now I am lost!!

                                    As Iggy points out from the strace, the OS is returning an error back to the Oracle foreground process, is this not handled?
                                    Or is it normal for oracle to ignore this error, and ask for more memory?

                                    Any help would be greatly appreciated.

                                    Thanks,
                                    Tom

                                    Edited by: Tom 55 on Nov 13, 2012 7:54 AM
                                    1 2 Previous Next