阅读量:0
from typing import Iterable

import pandas as pd

# Input workbook holding the raw questionnaire export.
file_path = "量表编码.xlsx"

# Column headers of the multi-select questions. An entry is only expanded when
# it matches a column name exactly; entries with no matching column are skipped.
# NOTE(review): these look like question-number prefixes -- confirm they equal
# the real headers in the workbook.
multiple_choice_cols = [
    "7、",
    "8、",
    "9、",
    "12、",
    "14、",
    "21、",
    "25、",
]

# Separator placed between the selected options inside one multi-select cell.
_OPTION_SEPARATOR = "┋"


def process_multiple_choice_columns(
    data: pd.DataFrame, columns: Iterable[str]
) -> pd.DataFrame:
    """One-hot encode multi-select columns, keeping them at their position.

    Each listed column that exists in ``data`` is split on the option
    separator and replaced by one indicator column per distinct option,
    named ``"<column>_<option>"``. The indicator columns are inserted where
    the original column stood, in the order ``Series.str.get_dummies``
    returns them.

    Args:
        data: Frame containing the survey answers.
        columns: Names of the multi-select columns to expand. Names that are
            not present in ``data`` are ignored.

    Returns:
        A frame with the listed columns expanded. When none of the names is
        present, ``data`` itself is returned unchanged.
    """
    for col in columns:
        if col not in data.columns:
            continue
        position = data.columns.get_loc(col)
        dummies = data[col].str.get_dummies(_OPTION_SEPARATOR)
        # ``drop`` returns a new frame, so the caller's frame is not modified
        # by the in-place ``insert`` calls below.
        data = data.drop(columns=[col])
        for offset, option in enumerate(dummies.columns):
            data.insert(position + offset, f"{col}_{option}", dummies[option])
    return data


def process_non_numeric_columns(data: pd.DataFrame) -> pd.DataFrame:
    """Replace every object-dtype column with its integer category codes.

    The codes come from ``pd.Categorical(...).codes``; missing values are
    therefore encoded as ``-1``.

    Args:
        data: Frame whose object-dtype columns should be encoded.

    Returns:
        A new frame with the object-dtype columns encoded; ``data`` itself is
        left untouched.
    """
    # Work on a copy so the caller's frame is not overwritten in place.
    data = data.copy()
    for col in data.select_dtypes(include=["object"]).columns:
        data[col] = pd.Categorical(data[col]).codes
    return data


def main() -> None:
    """Read the workbook, encode it and write the result to an Excel file."""
    data = pd.read_excel(file_path)
    data = process_multiple_choice_columns(data, multiple_choice_cols)
    data = process_non_numeric_columns(data)
    # Output file name kept from the original script; no scaling is applied.
    data.to_excel("data_scaled.xlsx")


if __name__ == "__main__":
    main()