📅  最后修改于: 2022-03-11 14:45:04.489000             🧑  作者: Mango
import pandas as pd
import pyreadstat
filename = 'foo.SAS7BDAT'
CHUNKSIZE = 50000
offset = 0
allChunk,_ = getChunk(row['filePath'], row_limit=CHUNKSIZE, row_offset=offset)
allChunk = allChunk.astype('category')
while True:
offset += CHUNKSIZE
# for xpt data, use pyreadstat.read_xpt()
chunk, _ = pyreadstat.read_sas7bdat(filename, row_limit=CHUNKSIZE, row_offset=offset)
if chunk.empty: break # if chunk is empty, it means the entire data has been read, so break
for eachCol in chunk: #converting each column to categorical
colUnion = pd.api.types.union_categoricals([allChunk[eachCol], chunk[eachCol]])
allChunk[eachCol] = pd.Categorical(allChunk[eachCol], categories=colUnion.categories)
chunk[eachCol] = pd.Categorical(chunk[eachCol], categories=colUnion.categories)
allChunk = pd.concat([allChunk, chunk]) #Append each chunk to the resulting dataframe