最近几天发现库里有坏块了,环境是11gR2, linux平台的64位的库。以下是我的修复办法,基于dbms_repair做的在线修复,也可以基于备份rman来修复,archivelog,noarchive log可能修复的方式有所不同。
-->首先从alert.log里面发现如下的错误。
DDE: Problem Key 'ORA 1110' was flood controlled (0x1) (no incident)
ORA-01110: data file 8: '/dbTS2/oracle/TESTDB2/oradata3/data/TESTDB2_pool_data_03.dbf'
Byte offset to file# 8 block# 570051 is 374890496
Incident 1567129 created, dump file: 

-->从trace文件里有更详细的描述。
/opt/app/oracle/testdb2/admin/TESTDB2/diag/rdbms/TESTDB2/TESTDB2/incident/incdir_1567129/TESTDB2_o
ra_5396_i1567129.trc
ORA-01578: ORACLE data block corrupted (file # 8, block # 570051)
ORA-01110: data file 8: '/dbTS2/oracle/TESTDB2/oradata3/data/TESTDB2_pool_data_03.dbf'

Dump continued from file: /opt/app/oracle/testdb2/admin/TESTDB2/diag/rdbms/TESTDB2/TESTDB2/trace/TESTDB2_ora_5396.trc
ORA-01578: ORACLE data block corrupted (file # 8, block # 570051)
ORA-01110: data file 8: '/dbTS2/oracle/TESTDB2/oradata3/data/TESTDB2_pool_data_03.dbf'

========= Dump for incident 1567129 (ORA 1578) ========
*** 2013-12-11 07:25:21.257
dbkedDefDump(): Starting incident default dumps (flags=0x2, level=3, mask=0x0)
----- Current SQL Statement for this session (sql_id=7u9gsk798bvrp) -----
SELECT   xxxxx FROM   APP_CONTROL 
AC,   APP_BILL_PROC BL WHERE APP.DATA_GROUP IS NOT NULL AND BL.PROCESS_ID = APP.NXT_PGM_NAME AND  
APP.FILE_STATUS IN ('RD', 'IU', 'CN')
 GROUP BY xxxxxxx

-->尝试查看坏块的segment_type,确认一下是Index还是table segment出问题了。查询没有任何结果。
SQL> select segment_name,tablespace_name,segment_type,block_id,file_id,bytes from dba_extents where block_id=570051 and file_id=8;
no rows selected

-->运行日志中的sql,果断的报错了。
SELECT   xxxxx FROM   APP_CONTROL 
AC,   APP_BILL_PROC BL WHERE APP.DATA_GROUP IS NOT NULL AND BL.PROCESS_ID = APP.NXT_PGM_NAME AND  
APP.FILE_STATUS IN ('RD', 'IU', 'CN')
 GROUP BY xxxxxxx

                                                                                                                        *
ERROR at line 1:
ORA-01578: ORACLE data block corrupted (file # 8, block # 570051)
ORA-01110: data file 8:
'/dbTS2/oracle/TESTDB2/oradata3/data/TESTDB2_pool_data_03.dbf'


-->只是从相关的表里select count没有任何问题。
SQL> select count(*)from APP_CONTROL;
  COUNT(*)
----------
      1613

SQL> select count(*)from APP_BILL_PROC ;
  COUNT(*)
----------
       103

-->再次验证,还是报错。
SELECT   xxxxx FROM   APP_CONTROL 
AC,   APP_BILL_PROC BL WHERE APP.DATA_GROUP IS NOT NULL AND BL.PROCESS_ID = APP.NXT_PGM_NAME AND  
APP.FILE_STATUS IN ('RD', 'IU', 'CN')
 GROUP BY xxxxxxx      
                                                                                                              *
ERROR at line 1:
ORA-01578: ORACLE data block corrupted (file # 8, block # 570051)
ORA-01110: data file 8:
'/dbTS2/oracle/TESTDB2/oradata3/data/TESTDB2_pool_data_03.dbf'

--通过sys来调用dbms_repair来修复。
SQL> BEGIN
  2    DBMS_REPAIR.ADMIN_TABLES (
  3    TABLE_NAME => 'REPAIR_TABLE',
  4    TABLE_TYPE => dbms_repair.repair_table,
  5    ACTION => dbms_repair.create_action,
  6    TABLESPACE => '&tablespace_name');
  7  END;
  8  /
Enter value for tablespace_name: 
old   6:   TABLESPACE => '&tablespace_name');
new   6:   TABLESPACE => 'POOL_DATA');

PL/SQL procedure successfully completed.

-->以上的步骤会生成一个表repair_table
SQL> desc repair_table
 Name                                      Null?    Type
 ----------------------------------------- -------- ----------------------------
 OBJECT_ID                                 NOT NULL NUMBER
 TABLESPACE_ID                             NOT NULL NUMBER
 RELATIVE_FILE_ID                          NOT NULL NUMBER
 BLOCK_ID                                  NOT NULL NUMBER
 CORRUPT_TYPE                              NOT NULL NUMBER
 SCHEMA_NAME                               NOT NULL VARCHAR2(30)
 OBJECT_NAME                               NOT NULL VARCHAR2(30)
 BASEOBJECT_NAME                                    VARCHAR2(30)
 PARTITION_NAME                                     VARCHAR2(30)
 CORRUPT_DESCRIPTION                                VARCHAR2(2000)
 REPAIR_DESCRIPTION                                 VARCHAR2(200)
 MARKED_CORRUPT                            NOT NULL VARCHAR2(10)
 CHECK_TIMESTAMP                           NOT NULL DATE
 FIX_TIMESTAMP                                      DATE
 REFORMAT_TIMESTAMP                                 DATE

-->来定位schema object中的坏块情况
SQL> set serveroutput on
DECLARE num_corrupt INT;
SQL>   2  BEGIN
  3    num_corrupt := 0;
  4    DBMS_REPAIR.CHECK_OBJECT (
  5    SCHEMA_NAME => '&schema_name',
  6    OBJECT_NAME => '&object_name',
  7    REPAIR_TABLE_NAME => 'REPAIR_TABLE',
  8    corrupt_count => num_corrupt);
  9    DBMS_OUTPUT.PUT_LINE('number corrupt: ' || TO_CHAR (num_corrupt));
 10  END;
 11  /
Enter value for schema_name: TSTAPPO2
old   5:   SCHEMA_NAME => '&schema_name',
new   5:   SCHEMA_NAME => 'TSTAPPO2',
Enter value for object_name: APP_CONTROL
old   6:   OBJECT_NAME => '&object_name',
new   6:   OBJECT_NAME => 'APP_CONTROL',
number corrupt: 1

PL/SQL procedure successfully completed.

-->查询生成的坏块表,里面有相应的记录。指向的坏块确实是日志中指定的。
SQL> select BLOCK_ID, CORRUPT_TYPE, CORRUPT_DESCRIPTION
  2  from REPAIR_TABLE;

  BLOCK_ID CORRUPT_TYPE
---------- ------------
CORRUPT_DESCRIPTION
--------------------------------------------------------------------------------
    570051         6148

-->修复坏块
SQL> DECLARE num_fix INT;
  2  BEGIN
  3    num_fix := 0;
  4    DBMS_REPAIR.FIX_CORRUPT_BLOCKS (
  5    SCHEMA_NAME => '&schema_name',
  6    OBJECT_NAME=> '&object_name',
  7    OBJECT_TYPE => dbms_repair.table_object,
  8    REPAIR_TABLE_NAME => 'REPAIR_TABLE',
  9    FIX_COUNT=> num_fix);
 10    DBMS_OUTPUT.PUT_LINE('num fix: ' || to_char(num_fix));
 11  END;
 12  /
Enter value for schema_name: TSTAPPO2
old   5:   SCHEMA_NAME => '&schema_name',
new   5:   SCHEMA_NAME => 'TSTAPPO2',
Enter value for object_name: APP_CONTROL
old   6:   OBJECT_NAME=> '&object_name',
new   6:   OBJECT_NAME=> 'APP_CONTROL',
num fix: 0

PL/SQL procedure successfully completed.

-->对于坏块的操作都能够skip
SQL> BEGIN
  DBMS_REPAIR.SKIP_CORRUPT_BLOCKS (
  2    3    SCHEMA_NAME => '&schema_name',
  4    OBJECT_NAME => '&object_name',
  5    OBJECT_TYPE => dbms_repair.table_object,
  6    FLAGS => dbms_repair.SKIP_FLAG);
  7  END;
  8  /
Enter value for schema_name: TSTAPPO2
old   3:   SCHEMA_NAME => '&schema_name',
new   3:   SCHEMA_NAME => 'TSTAPPO2',
Enter value for object_name: APP_CONTROL
old   4:   OBJECT_NAME => '&object_name',
new   4:   OBJECT_NAME => 'APP_CONTROL',

PL/SQL procedure successfully completed.


-->再次运行以上的sql,尝试。
SQL> l
SELECT   xxxxx FROM   APP_CONTROL 
AC,   APP_BILL_PROC BL WHERE APP.DATA_GROUP IS NOT NULL AND BL.PROCESS_ID = APP.NXT_PGM_NAME AND  
APP.FILE_STATUS IN ('RD', 'IU', 'CN')
 GROUP BY xxxxxxx    ;

working....