Spider 1.0 | Parsing and Text-to-SQL Challenge

Dec 19, 2023 · 1 min read

I presented a part of code that we used in processing the Spider Challenge data.

readSqlite.py

This function reads the information from the .sqlite file. If the .sqlite file had saved examples, then output the TOP 5 examples.

import sqlite3
import os
import json

# 数据库文件所在路径
file_path = ".../spider/original_data/database"
# 表信息文件所在路径
tables_path = ".../spider/extracted_data/tables_info.json"
# 数据保存路径
path_save = ".../spider/extracted_data/readSqlite.json"

# 加载表信息数据
with open(tables_path, 'r', encoding="utf8") as file:
    tables_info = json.load(file)

data = []
for db_name in os.listdir(file_path):
    # # 打印数据库名称
    # print(db_name)
    db_data = sqlite3.connect(file_path + "/" + db_name + "/" + db_name + ".sqlite")
    # 当解码字符串遇到乱码时,替换成“?”号
    db_data.text_factory = lambda b: b.decode(errors = 'replace')
    
    # 读取数据库中包含的表的名字
    db_cur = db_data.cursor()
    db_cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = db_cur.fetchall()

    # 读取表中的信息
    for table in tables:
        table_cur = db_data.cursor()
        table_cur.execute("SELECT * FROM " + table[0])
        info = table_cur.fetchall()
        
        for i in tables_info:
            if i["db_id"] == db_name:
                column_names = i["tables_info"][table[0]]
        
        # print(type(info))
        
        h = {"db_name": db_name, "table_name":table[0], "column_name": column_names, "table_info":info[0:5]}
        data.append( h)

    
# 保存为json文件
with open(path_save, "w", encoding="utf-8") as outfile: 
    json.dump(data, outfile,indent=4,ensure_ascii=False)