搭建自己的金融数据源和量化分析平台（六）：下载并存储沪深两市上市公司财报_业界新闻

发布时间:2024-08-03 06:14

阅读量:0

出于不依赖 Wind、某花顺等第三方平台数据的考虑，本文尝试直接从财报中解析三大报表（资产负债表、利润表、现金流量表），进而计算 ROE 等财务指标。因此需要先下载沪深两市上市公司的财报，便于后续从 PDF 中解析三大报表。
深市爬虫好做，先放深市爬虫：

def get_financial_statements(path, time, stock_list, financial_statements_type):
    """Download periodic-report PDFs of Shenzhen Stock Exchange companies.

    For every combination of stock, year and report type, the SZSE
    announcement-list API is queried and each returned attachment is saved as
    ``<path><stock>/<year>/<year><type label>##<title>.pdf``. Announcements
    whose title contains "报告摘要" (report summary) are skipped, and files
    that already exist on disk are not downloaded again.

    Args:
        path: Root directory for the downloaded files. It is concatenated
            directly with the stock code, so it must end with a path
            separator (e.g. ``"F:/data/SZ/"``).
        time: Iterable of report years, e.g. ``[2023, 2024]``. Years must be
            integers when ``'annual'`` is requested, because the query window
            for annual reports ends in ``year + 1``.
        stock_list: Stock codes to download, e.g. ``['000001', '000002']``.
        financial_statements_type: Report types to download. Recognised
            values are ``'annual'``, ``'semi-annual'``, ``'quarterly_1'`` and
            ``'quarterly_3'``; any other value is treated as
            ``'quarterly_3'``.

    Returns:
        None. Progress and warnings are printed to stdout.
    """
    url = "https://www.szse.cn/api/disc/announcement/annList"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Content-Type': 'application/json',
        'Connection': 'close'
    }
    download_url = "https://disc.static.szse.cn/download"
    # Without a timeout a single stalled connection blocks the whole batch.
    request_timeout = 30  # seconds
    # Characters that cannot appear in a file name (Windows rules); this
    # includes the "*" of "*ST" titles and path separators.
    forbidden_chars = str.maketrans("", "", '\\/:*?"<>|')

    # fs_type -> (label, bigCategoryId, window start "-MM-DD",
    #             window ends in the following year?, window end "-MM-DD")
    report_types = {
        # The annual window runs into the following September so that
        # corrected/re-issued reports are still covered.
        'annual': ("年报", '010301', "-12-31", True, "-09-01"),
        'semi-annual': ("中报", '010303', "-07-01", False, "-12-31"),
        'quarterly_1': ("一季报", '010305', "-04-01", False, "-07-31"),
        'quarterly_3': ("三季报", '010307', "-10-01", False, "-12-31"),
    }
    fallback_type = report_types['quarterly_3']

    for stock in stock_list:
        for year in time:
            for fs_type in financial_statements_type:
                title, big_category_id, start_suffix, ends_next_year, end_suffix = \
                    report_types.get(fs_type, fallback_type)
                timestart = str(year) + start_suffix
                timeend = str(year + 1 if ends_next_year else year) + end_suffix
                label = stock + " " + str(year) + title

                query = {
                    "seDate": [timestart, timeend],
                    "stock": [stock],
                    "channelCode": ["listedNotice_disc"],
                    "bigCategoryId": [big_category_id],
                    "pageSize": 50,
                    "pageNum": 1
                }
                response = requests.post(url=url, data=json.dumps(query),
                                         headers=headers, timeout=request_timeout)
                announcements = json.loads(response.text)["data"]
                # Covers both an empty list and a null "data" field.
                if not announcements:
                    print("警告:股票代码:" + label + "不存在!")
                    continue

                file_path = path + stock + "/" + str(year)
                for entry in announcements:
                    # Summaries are not wanted, only the full reports.
                    if "报告摘要" in entry['title']:
                        continue
                    # Creates <path><stock>/<year> including missing parents.
                    os.makedirs(file_path, exist_ok=True)
                    file = (file_path + "/" + str(year) + title + "##"
                            + entry['title'].translate(forbidden_chars) + ".pdf")
                    if os.path.exists(file):
                        print("警告:股票代码:" + label + "已存在!")
                        continue
                    filecontent = requests.get(download_url + entry["attachPath"],
                                               timeout=request_timeout)
                    # Do not store an error page under a .pdf name.
                    if filecontent.status_code != 200:
                        print("警告:股票代码:" + label + "下载失败，HTTP状态码:"
                              + str(filecontent.status_code))
                        continue
                    with open(file, "wb") as pdf:
                        pdf.write(filecontent.content)
                    print("股票代码:" + label + "写入成功。")


# Example usage:
# timestart = [2023, 2024]
# stock_list = ['000001', '000002']
# financial_statements_type = ['annual', 'semi-annual', 'quarterly_1', 'quarterly_3']
# SZ_financial_statement_path = "F:/data/SZ/"
# get_financial_statements(SZ_financial_statement_path, timestart, stock_list, financial_statements_type)

沪市爬虫：

def get_financial_statements(path, time, stock_list, financial_statements_type):
    """Download periodic-report PDFs of Shanghai Stock Exchange companies.

    For every combination of stock, year and report type, the SSE company
    bulletin query endpoint is called and each returned document is saved as
    ``<path><stock>/<year>/<year><type label>##<title>.pdf``. Bulletins whose
    title contains "摘要" (summary) are skipped, and files that already exist
    on disk are not downloaded again.

    Args:
        path: Root directory for the downloaded files. It is concatenated
            directly with the stock code, so it must end with a path
            separator (e.g. ``"F:/data/SH/"``).
        time: Iterable of report years, e.g. ``[2023]``. Years must be
            integers when ``'annual'`` is requested, because the query window
            for annual reports ends in ``year + 1``.
        stock_list: Stock codes to download, e.g. ``['600011']``.
        financial_statements_type: Report types to download. Recognised
            values are ``'annual'``, ``'semi-annual'``, ``'quarterly_1'`` and
            ``'quarterly_3'``; any other value is treated as
            ``'quarterly_3'``.

    Returns:
        None. Progress and warnings are printed to stdout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Referer': 'https://www.sse.com.cn/',
        'Connection': 'close'
    }
    download_url = "https://www.sse.com.cn"
    # Without a timeout a single stalled connection blocks the whole batch.
    request_timeout = 30  # seconds
    # Characters that cannot appear in a file name (Windows rules); this
    # includes the "*" of "*ST" titles and path separators.
    forbidden_chars = str.maketrans("", "", '\\/:*?"<>|')

    # fs_type -> (label, reportType query code, window start "-MM-DD",
    #             window ends in the following year?, window end "-MM-DD")
    # The "QUATER" spelling is the code sent to the endpoint; do not correct it.
    report_types = {
        # The annual window runs into the following September so that
        # corrected/re-issued reports are still covered.
        'annual': ("年报", 'YEARLY', "-12-31", True, "-09-01"),
        'semi-annual': ("中报", 'QUATER2', "-07-01", False, "-12-31"),
        'quarterly_1': ("一季报", 'QUATER1', "-04-01", False, "-07-31"),
        'quarterly_3': ("三季报", 'QUATER3', "-10-01", False, "-12-31"),
    }
    fallback_type = report_types['quarterly_3']

    for stock in stock_list:
        for year in time:
            for fs_type in financial_statements_type:
                title, report_code, start_suffix, ends_next_year, end_suffix = \
                    report_types.get(fs_type, fallback_type)
                timestart = str(year) + start_suffix
                timeend = str(year + 1 if ends_next_year else year) + end_suffix
                label = stock + " " + str(year) + title

                # The endpoint answers in JSONP; a random callback name is
                # sent with every request.
                url = (
                    "https://query.sse.com.cn/security/stock/queryCompanyBulletin.do"
                    "?jsonCallBack=jsonpCallback" + str(random.randint(10000, 999999))
                    + "&isPagination=true&pageHelp.pageSize=50&pageHelp.pageNo=1"
                    "&pageHelp.beginPage=1&pageHelp.cacheSize=1&pageHelp.endPage=1"
                    "&productId=" + stock
                    + "&securityType=0101%2C120100%2C020100%2C020200%2C120200"
                    "&reportType2=DQBG&reportType=" + report_code
                    + "&beginDate=" + timestart + "&endDate=" + timeend
                )
                response = requests.get(url=url, headers=headers, timeout=request_timeout)
                # Cut the "pageHelp" object out of the JSONP text using the
                # keys that surround it.
                # NOTE(review): this relies on the key order of the response
                # ("keyWord", "pageHelp", "productId") staying as it is.
                after_marker = response.text.split('"keyWord":null,"pageHelp":')[1]
                page_help_json = after_marker.split(',"productId":')[0]
                datas = json.loads(page_help_json)['data']
                # Covers both an empty list and a null "data" field.
                if not datas:
                    print("警告:股票代码:" + label + "不存在!")
                    continue

                file_path = path + stock + "/" + str(year)
                for entry in datas:
                    # Summaries are not wanted, only the full reports.
                    if "摘要" in entry['TITLE']:
                        continue
                    # Creates <path><stock>/<year> including missing parents.
                    os.makedirs(file_path, exist_ok=True)
                    file = (file_path + "/" + str(year) + title + "##"
                            + entry['TITLE'].translate(forbidden_chars) + ".pdf")
                    if os.path.exists(file):
                        print("警告:股票代码:" + label + "已存在!")
                        continue
                    filecontent = requests.get(download_url + entry["URL"],
                                               timeout=request_timeout)
                    # Do not store an error page under a .pdf name.
                    if filecontent.status_code != 200:
                        print("警告:股票代码:" + label + "下载失败，HTTP状态码:"
                              + str(filecontent.status_code))
                        continue
                    with open(file, "wb") as pdf:
                        pdf.write(filecontent.content)
                    print("股票代码:" + label + "写入成功。")


# Example usage:
# timestart = [2023]
# stock_list = ['600011']
# financial_statements_type = ['annual', 'semi-annual', 'quarterly_1', 'quarterly_3']
# SH_financial_statement_path = "F:/data/SH/"
# get_financial_statements(SH_financial_statement_path, timestart, stock_list, financial_statements_type)