How does Hive parse SQL? Let's start with a Hive CREATE TABLE statement, for example:
create table test(id int, name string) row format delimited fields terminated by '\t';
Then use Hive's SHOW CREATE TABLE to inspect the table that was created; this is a text table:
CREATE TABLE `test`(
  `id` int,
  `name` string)
ROW FORMAT SERDE
  'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
  'field.delim'='\t',
  'serialization.format'='\t')
STORED AS INPUTFORMAT
  'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
  'hdfs://master:8020/user/hive/warehouse/test'
TBLPROPERTIES (
  'transient_lastDdlTime'='1568561230')
Of course there are all sorts of other CREATE TABLE statements, for example:
A CSV table:
CREATE EXTERNAL TABLE `default.test_1`(
  `key` string COMMENT 'from deserializer',
  `value` string COMMENT 'from deserializer')
ROW FORMAT SERDE
  'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
  'escapeChar'='\\',
  'quoteChar'='\'',
  'separatorChar'='\t')
STORED AS INPUTFORMAT
  'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
  'hdfs://master:8020/user/hive/warehouse/test'
TBLPROPERTIES (
  'COLUMN_STATS_ACCURATE'='false',
  'numFiles'='0',
  'numRows'='-1',
  'rawDataSize'='-1',
  'totalSize'='0',
  'transient_lastDdlTime'='xxxx')
A Parquet table:
CREATE TABLE `default.test`(
  `time` string,
  `server` int,
  `id` bigint)
PARTITIONED BY (
  `ds` string)
ROW FORMAT SERDE
  'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
WITH SERDEPROPERTIES (
  'field.delim'='\t',
  'serialization.format'='\t')
STORED AS INPUTFORMAT
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
  'hdfs://master:8020/user/hive/warehouse/test'
TBLPROPERTIES (
  'transient_lastDdlTime'='xxxx')
A JSON table:
CREATE EXTERNAL TABLE `default.test`(
  `titleid` string COMMENT 'from deserializer',
  `timestamp` string COMMENT 'from deserializer')
ROW FORMAT SERDE
  'org.openx.data.jsonserde.JsonSerDe'
STORED AS INPUTFORMAT
  'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
  'hdfs://master:8020/user/hive/warehouse/test'
TBLPROPERTIES (
  'COLUMN_STATS_ACCURATE'='false',
  'numFiles'='0',
  'numRows'='-1',
  'rawDataSize'='-1',
  'totalSize'='0',
  'transient_lastDdlTime'='xxxx')
An Elasticsearch table:
CREATE EXTERNAL TABLE `default.test`(
  `id` string COMMENT 'from deserializer',
  `ts` string COMMENT 'from deserializer')
PARTITIONED BY (
  `ds` string)
ROW FORMAT SERDE
  'org.elasticsearch.hadoop.hive.EsSerDe'
STORED BY
  'org.elasticsearch.hadoop.hive.EsStorageHandler'
WITH SERDEPROPERTIES (
  'serialization.format'='1')
LOCATION
  'hdfs://master:8020/user/hive/warehouse/test'
TBLPROPERTIES (
  'es.index.auto.create'='yes',
  'es.index.read.missing.as.empty'='yes',
  'es.nodes'='host1,host2',
  'es.port'='9200',
  'es.resource'='index1/type1')
A binary table using Thrift:
CREATE EXTERNAL TABLE `default.test`(
  `bbb` string COMMENT 'from deserializer',
  `aaa` string COMMENT 'from deserializer')
COMMENT 'aas'
PARTITIONED BY (
  `ds` string COMMENT 'date partition')
ROW FORMAT SERDE
  'org.apache.hadoop.hive.serde2.thrift.ThriftDeserializer'
WITH SERDEPROPERTIES (
  'serialization.class'='com.xxx.xxx.xxx.tables.v1.XXXX',
  'serialization.format'='org.apache.thrift.protocol.TCompactProtocol')
STORED AS INPUTFORMAT
  'org.apache.hadoop.mapred.SequenceFileInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat'
LOCATION
  'hdfs://master:8020/user/hive/warehouse/test'
TBLPROPERTIES (
  'transient_lastDdlTime'='xxxxxx')
An HBase table:
CREATE EXTERNAL TABLE default.hbase_table(
  key string,
  k1 string,
  k2 string,
  k3 string,
  k4 string,
  ts string)
ROW FORMAT SERDE
  'org.apache.hadoop.hive.hbase.HBaseSerDe'
STORED BY
  'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping"=":key,c:k1,c:k2,c:k3,c:k4,c:ts")
TBLPROPERTIES("hbase.table.name" = "xxxxx");
A MongoDB table:
CREATE EXTERNAL TABLE IF NOT EXISTS xx.xx (
  k1 string,
  k2 int
)
STORED BY 'com.mongodb.hadoop.hive.MongoStorageHandler'
WITH SERDEPROPERTIES('mongo.columns.mapping'='{"k1":"k1","k2":"k2"}','mongo.input.split_size'='16384')
TBLPROPERTIES('mongo.uri'='mongodb://xxx:xxx/table1.collection1');
And so on.
You can look at the Hive source behind SHOW CREATE TABLE:
https://github.com/apache/hive/blob/68ae4a5cd1b916098dc1deb2bcede5f862afd80e/ql/src/java/org/apache/hadoop/hive/ql/ddl/table/creation/ShowCreateTableOperation.java
where the template shows the basic pieces of information that make up a Hive table:
private static final String CREATE_TABLE_TEMPLATE =
    "CREATE <" + TEMPORARY + "><" + EXTERNAL + ">TABLE `<" + NAME + ">`(\n" +
    "<" + LIST_COLUMNS + ">)\n" +
    "<" + COMMENT + ">\n" +
    "<" + PARTITIONS + ">\n" +
    "<" + BUCKETS + ">\n" +
    "<" + SKEWED + ">\n" +
    "<" + ROW_FORMAT + ">\n" +
    "<" + LOCATION_BLOCK + ">" +
    "TBLPROPERTIES (\n" +
    "<" + PROPERTIES + ">)\n";

private String getCreateTableCommand(Table table) {
  ST command = new ST(CREATE_TABLE_TEMPLATE);
  command.add(NAME, desc.getTableName());
  command.add(TEMPORARY, getTemporary(table));
  command.add(EXTERNAL, getExternal(table));
  command.add(LIST_COLUMNS, getColumns(table));
  command.add(COMMENT, getComment(table));
  command.add(PARTITIONS, getPartitions(table));
  command.add(BUCKETS, getBuckets(table));
  command.add(SKEWED, getSkewed(table));
  command.add(ROW_FORMAT, getRowFormat(table));
  command.add(LOCATION_BLOCK, getLocationBlock(table));
  command.add(PROPERTIES, getProperties(table));
  return command.render();
}
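The placeholders here are filled in by StringTemplate (the ST class). As a minimal standalone sketch of that rendering mechanism (the toy template, class name, and values below are mine for illustration, not Hive's):

import org.stringtemplate.v4.ST;

public class STDemo {
  public static void main(String[] args) {
    // A toy template in the same style as CREATE_TABLE_TEMPLATE
    ST st = new ST("CREATE TABLE `<name>`(\n<cols>)");
    st.add("name", "test");
    st.add("cols", "  `id` int,\n  `name` string");
    // render() substitutes every <placeholder> and returns the DDL text
    System.out.println(st.render());
  }
}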
When a user submits a CREATE TABLE statement, you can look at the source that handles it, starting in ParseUtils:
https://github.com/apache/hive/blob/ff98efa7c6f2b241d8fddd0ac8dc55e817ecb234/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseUtils.java
Meituan-Dianping also has a good write-up of the Hive SQL compilation process:
https://tech.meituan.com/2014/02/12/hive-sql-to-mapreduce.html
There you can see that a CREATE TABLE statement is first converted into a syntax tree by the ANTLR-generated parser:
public static ASTNode parse(String command) throws ParseException {
  return parse(command, null);
}
Then getTable can be used to extract the database name and table name from the tree; see AnalyzeCommandUtils:
https://github.com/apache/hive/blob/f37c5de6c32b9395d1b34fa3c02ed06d1bfbf6eb/ql/src/java/org/apache/hadoop/hive/ql/parse/AnalyzeCommandUtils.java
Source:
public static Table getTable(ASTNode tree, BaseSemanticAnalyzer sa) throws SemanticException {
  String tableName = ColumnStatsSemanticAnalyzer.getUnescapedName((ASTNode) tree.getChild(0).getChild(0));
  String currentDb = SessionState.get().getCurrentDatabase();
  String[] names = Utilities.getDbTableName(currentDb, tableName);
  return sa.getTable(names[0], names[1], true);
}
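The key helper is Utilities.getDbTableName, which falls back to the session's current database when the table name is not qualified. A simplified standalone sketch of that splitting logic (my own illustration, not the Hive implementation):

public class DbTableName {
  // Split "db.table" into {db, table}; fall back to currentDb for a bare "table"
  static String[] getDbTableName(String currentDb, String name) {
    String[] parts = name.split("\\.");
    if (parts.length == 2) {
      return parts;                          // already qualified: db.table
    }
    return new String[] {currentDb, name};   // unqualified: use the session db
  }

  public static void main(String[] args) {
    String[] n1 = getDbTableName("default", "db1.t1"); // -> [db1, t1]
    String[] n2 = getDbTableName("default", "t1");     // -> [default, t1]
    System.out.println(n1[0] + "." + n1[1] + " / " + n2[0] + "." + n2[1]);
  }
}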
ParseUtils.parse itself delegates to ParseDriver, which drives the generated ANTLR lexer and parser:

https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseDriver.java
public ASTNode parse(String command) throws ParseException {
  return parse(command, null);
}
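As a quick end-to-end sketch (assuming a Hive version where ParseDriver.parse returns the ASTNode directly, as in the snippet above; the class name AstDump is mine):

import org.apache.hadoop.hive.ql.parse.ASTNode;
import org.apache.hadoop.hive.ql.parse.ParseDriver;
import org.apache.hadoop.hive.ql.parse.ParseException;

public class AstDump {
  public static void main(String[] args) throws ParseException {
    ParseDriver pd = new ParseDriver();
    // ANTLR turns the DDL text into an ASTNode tree rooted at TOK_CREATETABLE
    ASTNode ast = pd.parse(
        "create table test(id int, name string) "
            + "row format delimited fields terminated by '\\t'");
    // dump() pretty-prints the tree: TOK_CREATETABLE, TOK_TABNAME, TOK_TABCOLLIST, ...
    System.out.println(ast.dump());
  }
}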
Next, to extract the inputFormat, outputFormat, SerDe, and storageHandler, see the source of StorageFormat:

https://github.com/apache/hive/blob/f37c5de6c32b9395d1b34fa3c02ed06d1bfbf6eb/ql/src/java/org/apache/hadoop/hive/ql/parse/StorageFormat.java
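For a feel of where those values live in the tree, the STORED AS clause shows up under a TOK_TABLEFILEFORMAT node whose children are the two class-name string literals. A rough sketch (the token layout is my assumption from the Hive grammar, and getText() still carries the surrounding quotes here):

import org.apache.hadoop.hive.ql.parse.ASTNode;
import org.apache.hadoop.hive.ql.parse.HiveParser;
import org.apache.hadoop.hive.ql.parse.ParseDriver;

public class StorageFormatDemo {
  // Recursively find the first node with the given token type
  static ASTNode find(ASTNode node, int tokenType) {
    if (node.getType() == tokenType) return node;
    for (int i = 0; i < node.getChildCount(); i++) {
      ASTNode hit = find((ASTNode) node.getChild(i), tokenType);
      if (hit != null) return hit;
    }
    return null;
  }

  public static void main(String[] args) throws Exception {
    ASTNode ast = new ParseDriver().parse(
        "create table t(id int) stored as "
            + "inputformat 'org.apache.hadoop.mapred.TextInputFormat' "
            + "outputformat 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'");
    // TOK_TABLEFILEFORMAT carries the two string literals as children
    ASTNode ff = find(ast, HiveParser.TOK_TABLEFILEFORMAT);
    System.out.println("inputFormat  = " + ff.getChild(0).getText());
    System.out.println("outputFormat = " + ff.getChild(1).getText());
  }
}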
To extract the column list, skewed values, table name, and row format, see BaseSemanticAnalyzer:
https://github.com/apache/hive/blob/f37c5de6c32b9395d1b34fa3c02ed06d1bfbf6eb/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java
Source:
public static List<FieldSchema> getColumns(
    ASTNode ast, boolean lowerCase, TokenRewriteStream tokenRewriteStream,
    List<SQLPrimaryKey> primaryKeys, List<SQLForeignKey> foreignKeys,
    List<SQLUniqueConstraint> uniqueConstraints, List<SQLNotNullConstraint> notNullConstraints,
    List<SQLDefaultConstraint> defaultConstraints, List<SQLCheckConstraint> checkConstraints,
    Configuration conf) throws SemanticException {
  // ... method body omitted ...
}
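getColumns does much more (constraints, comments, nested types), but the core walk is over TOK_TABCOLLIST, with one TOK_TABCOL child per column. A stripped-down sketch of just the name/type extraction (my own simplification, not the Hive code; find is the same helper as in the previous sketch):

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.hive.ql.parse.ASTNode;
import org.apache.hadoop.hive.ql.parse.HiveParser;
import org.apache.hadoop.hive.ql.parse.ParseDriver;

public class ColumnsDemo {
  static ASTNode find(ASTNode node, int tokenType) {
    if (node.getType() == tokenType) return node;
    for (int i = 0; i < node.getChildCount(); i++) {
      ASTNode hit = find((ASTNode) node.getChild(i), tokenType);
      if (hit != null) return hit;
    }
    return null;
  }

  public static void main(String[] args) throws Exception {
    ASTNode ast = new ParseDriver().parse("create table t(id int, name string)");
    // TOK_TABCOLLIST holds one TOK_TABCOL per column
    ASTNode colList = find(ast, HiveParser.TOK_TABCOLLIST);
    List<String> cols = new ArrayList<>();
    for (int i = 0; i < colList.getChildCount(); i++) {
      ASTNode col = (ASTNode) colList.getChild(i);
      // child 0 = column name, child 1 = type token (e.g. TOK_INT)
      cols.add(col.getChild(0).getText() + ":" + col.getChild(1).getText());
    }
    System.out.println(cols); // e.g. [id:TOK_INT, name:TOK_STRING]
  }
}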
Source:
/**
 * Get the unqualified name from a table node.
 *
 * This method works for table names qualified with their schema (e.g., "db.table")
 * and table names without schema qualification. In both cases, it returns
 * the table name without the schema.
 *
 * @param node the table node
 * @return the table name without schema qualification
 *         (i.e., if name is "db.table" or "table", returns "table")
 */
public static String getUnescapedUnqualifiedTableName(ASTNode node) {
  assert node.getChildCount() <= 2;
  if (node.getChildCount() == 2) {
    node = (ASTNode) node.getChild(1);
  }
  return getUnescapedName(node);
}
Source:
protected void analyzeRowFormat(ASTNode child) throws SemanticException {
  child = (ASTNode) child.getChild(0);
  int numChildRowFormat = child.getChildCount();
  for (int numC = 0; numC < numChildRowFormat; numC++) {
    ASTNode rowChild = (ASTNode) child.getChild(numC);
    switch (rowChild.getToken().getType()) {
    case HiveParser.TOK_TABLEROWFORMATFIELD:
      fieldDelim = unescapeSQLString(rowChild.getChild(0).getText());
      if (rowChild.getChildCount() >= 2) {
        fieldEscape = unescapeSQLString(rowChild.getChild(1).getText());
      }
      break;
    case HiveParser.TOK_TABLEROWFORMATCOLLITEMS:
      collItemDelim = unescapeSQLString(rowChild.getChild(0).getText());
      break;
    case HiveParser.TOK_TABLEROWFORMATMAPKEYS:
      mapKeyDelim = unescapeSQLString(rowChild.getChild(0).getText());
      break;
    case HiveParser.TOK_TABLEROWFORMATLINES:
      lineDelim = unescapeSQLString(rowChild.getChild(0).getText());
      if (!lineDelim.equals("\n") && !lineDelim.equals("10")) {
        throw new SemanticException(SemanticAnalyzer.generateErrorMessage(rowChild,
            ErrorMsg.LINES_TERMINATED_BY_NON_NEWLINE.getMsg()));
      }
      break;
    case HiveParser.TOK_TABLEROWFORMATNULL:
      nullFormat = unescapeSQLString(rowChild.getChild(0).getText());
      break;
    default:
      throw new AssertionError("Unkown Token: " + rowChild);
    }
  }
}
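Each delimiter passes through unescapeSQLString, which strips the surrounding quotes and resolves escape sequences, so the '\t' in the original DDL becomes a real tab character. A small illustration (assuming the public static unescapeSQLString on BaseSemanticAnalyzer; the class name DelimDemo is mine):

import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer;

public class DelimDemo {
  public static void main(String[] args) {
    // The parser hands over the literal token text, quotes included: '\t'
    String fieldDelim = BaseSemanticAnalyzer.unescapeSQLString("'\\t'");
    // prints 9: the single character is an actual tab, not backslash-t
    System.out.println((int) fieldDelim.charAt(0));
  }
}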
For partition information, first obtain a Map of partition key/value pairs; see AnalyzeCommandUtils:
https://github.com/apache/hive/blob/6f18bbbc2e030ce7d446b2475037203cbd4f860d/ql/src/java/org/apache/hadoop/hive/ql/parse/AnalyzeCommandUtils.java
Source:
public static Map<String, String> getPartKeyValuePairsFromAST(Table tbl, ASTNode tree,
    HiveConf hiveConf) throws SemanticException {
  ASTNode child = ((ASTNode) tree.getChild(0).getChild(1));
  Map<String, String> partSpec = new HashMap<String, String>();
  if (child != null) {
    partSpec = DDLSemanticAnalyzer.getValidatedPartSpec(tbl, child, hiveConf, false);
  }
  // otherwise, it is the case of analyze table T compute statistics for columns;
  return partSpec;
}
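The returned partSpec is simply an ordered map from partition column to value; for a statement over partition (ds='2019-09-15') it would carry a single entry (hypothetical values, shape only):

import java.util.LinkedHashMap;
import java.util.Map;

public class PartSpecShape {
  public static void main(String[] args) {
    // LinkedHashMap keeps partition columns in declaration order
    Map<String, String> partSpec = new LinkedHashMap<>();
    partSpec.put("ds", "2019-09-15");
    System.out.println(partSpec); // {ds=2019-09-15}
  }
}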
then convert it into a List of Partition objects; see BaseSemanticAnalyzer:
https://github.com/apache/hive/blob/556531182dc989e12fd491d951b353b4df13fd47/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java
Source:
public Map<String, String> partSpec; // has to use LinkedHashMap to enforce order
public List<Partition> partitions; // involved partitions in TableScanOperator/FileSinkOperator

partitions = db.getPartitions(table, partSpec);
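Putting the two together, resolving the Partition list for a (possibly partial) spec might look like this. A sketch assuming a reachable metastore; Hive.get(conf) is the session's metastore client, and the table/partition names echo the examples above:

import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;

public class PartitionsDemo {
  public static void main(String[] args) throws Exception {
    HiveConf conf = new HiveConf();
    Hive db = Hive.get(conf);                     // metastore client for this session
    Table table = db.getTable("default", "test"); // e.g. the parquet table from above
    Map<String, String> partSpec = new LinkedHashMap<>();
    partSpec.put("ds", "2019-09-15");
    // all partitions matching the (possibly partial) spec
    List<Partition> partitions = db.getPartitions(table, partSpec);
    partitions.forEach(p -> System.out.println(p.getLocation()));
  }
}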
Finally, the location information (parsedLocation); see ImportSemanticAnalyzer:
https://github.com/apache/hive/blob/0213afb8a31af1f48d009edd41cec9e6c8942354/ql/src/java/org/apache/hadoop/hive/ql/parse/ImportSemanticAnalyzer.java