阅读量:0
出于不依赖 Wind、某花顺等第三方平台数据的考虑,本文尝试直接从财报中解析三大报表(资产负债表、利润表、现金流量表),进而计算 ROE 等财务指标。为此需要先下载沪深两市上市公司的财报 PDF,便于后续从 PDF 中解析三大报表。
深市爬虫好做,先放深市爬虫:
def get_financial_statements(path, time, stock_list, financial_statements_type,
                             timeout=30):
    """Download periodic-report PDFs of Shenzhen Stock Exchange listed companies.

    For every combination of stock, year and report type, the SZSE
    announcement-list API is queried and every returned announcement whose
    title does not contain "报告摘要" (report summary) is saved as
    ``<path><stock>/<year>/<year><label>##<title>.pdf``. Files that already
    exist are not downloaded again. Progress and warnings are printed.

    Only the first result page (at most 50 announcements) of each query is
    processed.

    Args:
        path: Root storage directory. It is concatenated directly with the
            stock code, so it must end with a path separator
            (e.g. ``"F:/data/SZ/"``).
        time: Iterable of report years as integers, e.g. ``[2023, 2024]``.
        stock_list: Stock codes to download, e.g. ``['000001', '000002']``.
        financial_statements_type: Report kinds to download, any of
            ``'annual'``, ``'semi-annual'``, ``'quarterly_1'``,
            ``'quarterly_3'`` (annual, semi-annual, first-quarter and
            third-quarter report). Any other value is treated as the
            third-quarter report.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        None.
    """
    url = "https://www.szse.cn/api/disc/announcement/annList"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Content-Type': 'application/json',
        'Connection': 'close'
    }
    download_url = "https://disc.static.szse.cn/download"

    # fs_type -> (file label, SZSE category id, window start, window end,
    #             whether the window ends in the following year)
    report_specs = {
        # The annual window runs into the next year's September so that
        # corrected reports published later are still covered.
        'annual': ("年报", '010301', "-12-31", "-09-01", True),
        'semi-annual': ("中报", '010303', "-07-01", "-12-31", False),
        'quarterly_1': ("一季报", '010305', "-04-01", "-07-31", False),
    }
    # Any unrecognised type falls back to the third-quarter report.
    fallback_spec = ("三季报", '010307', "-10-01", "-12-31", False)

    for stock in stock_list:
        for year in time:
            for fs_type in financial_statements_type:
                (title, bigCategoryId, start_suffix, end_suffix,
                 ends_next_year) = report_specs.get(fs_type, fallback_spec)
                timestart = str(year) + start_suffix
                timeend = str(year + 1 if ends_next_year else year) + end_suffix

                payload = {
                    "seDate": [timestart, timeend],
                    "stock": [stock],
                    "channelCode": ["listedNotice_disc"],
                    "bigCategoryId": [bigCategoryId],
                    "pageSize": 50,
                    "pageNum": 1
                }
                response = requests.post(url=url, data=json.dumps(payload),
                                         headers=headers, timeout=timeout)
                announcements = json.loads(response.text)["data"]

                # Covers both None and an empty list; the None case must be
                # checked before anything calls len() on the value.
                if not announcements:
                    print("警告:股票代码:"+stock+" "+str(year)+title+"不存在!")
                    continue

                for entry in announcements:
                    # Skip report summaries; only full reports are wanted.
                    if "报告摘要" in entry['title']:
                        continue

                    # Create <path><stock>/<year> (and any missing parents).
                    file_path = path + stock + "/" + str(year)
                    os.makedirs(file_path, exist_ok=True)

                    # "*" (e.g. in "*ST" names) is stripped from the file name.
                    file = (file_path + "/" + str(year) + title + "##"
                            + entry['title'].replace("*", "") + ".pdf")

                    # Download only when the file is not already on disk.
                    if os.path.exists(file):
                        print("警告:股票代码:" + stock + " " + str(year) + title + "已存在!")
                        continue

                    filecontent = requests.get(download_url + entry["attachPath"],
                                               timeout=timeout)
                    # Never store an error page as a .pdf: it would be
                    # skipped as "already exists" on every later run.
                    if filecontent.status_code != 200:
                        print("警告:股票代码:" + stock + " " + str(year) + title + "下载失败!")
                        continue
                    with open(file, "wb") as pdf:
                        pdf.write(filecontent.content)
                    print("股票代码:" + stock + " " + str(year) + title + "写入成功。")


# Example usage:
# timestart = [2023,2024]
# stock_list = ['000001','000002']
# financial_statements_type = ['annual', 'semi-annual', 'quarterly_1', 'quarterly_3']
# SZ_financial_statement_path = "F:/data/SZ/"
# get_financial_statements(SZ_financial_statement_path, timestart,stock_list,financial_statements_type)
沪市爬虫:
def get_financial_statements(path, time, stock_list, financial_statements_type,
                             timeout=30):
    """Download periodic-report PDFs of Shanghai Stock Exchange listed companies.

    For every combination of stock, year and report type, the SSE company
    bulletin query endpoint is called and every returned bulletin whose title
    does not contain "摘要" (summary) is saved as
    ``<path><stock>/<year>/<year><label>##<title>.pdf``. Files that already
    exist are not downloaded again. Progress and warnings are printed.

    Only the first result page (at most 50 bulletins) of each query is
    processed.

    Args:
        path: Root storage directory. It is concatenated directly with the
            stock code, so it must end with a path separator
            (e.g. ``"F:/data/SH/"``).
        time: Iterable of report years as integers, e.g. ``[2023, 2024]``.
        stock_list: Stock codes to download, e.g. ``['600011']``.
        financial_statements_type: Report kinds to download, any of
            ``'annual'``, ``'semi-annual'``, ``'quarterly_1'``,
            ``'quarterly_3'`` (annual, semi-annual, first-quarter and
            third-quarter report). Any other value is treated as the
            third-quarter report.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        None.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Referer': 'https://www.sse.com.cn/',
        'Connection': 'close'
    }
    download_url = "https://www.sse.com.cn"

    # fs_type -> (file label, SSE reportType code, window start, window end,
    #             whether the window ends in the following year)
    report_specs = {
        # The annual window runs into the next year's September so that
        # corrected reports published later are still covered.
        'annual': ("年报", 'YEARLY', "-12-31", "-09-01", True),
        'semi-annual': ("中报", 'QUATER2', "-07-01", "-12-31", False),
        'quarterly_1': ("一季报", 'QUATER1', "-04-01", "-07-31", False),
    }
    # Any unrecognised type falls back to the third-quarter report.
    fallback_spec = ("三季报", 'QUATER3', "-10-01", "-12-31", False)

    for stock in stock_list:
        for year in time:
            for fs_type in financial_statements_type:
                (title, bigCategoryId, start_suffix, end_suffix,
                 ends_next_year) = report_specs.get(fs_type, fallback_spec)
                timestart = str(year) + start_suffix
                timeend = str(year + 1 if ends_next_year else year) + end_suffix

                # A random numeric suffix is appended to the JSONP callback name.
                url = ("https://query.sse.com.cn/security/stock/queryCompanyBulletin.do?jsonCallBack=jsonpCallback"
                       + str(random.randint(10000, 999999))
                       + "&isPagination=true&pageHelp.pageSize=50&pageHelp.pageNo=1&pageHelp.beginPage=1&pageHelp.cacheSize=1&pageHelp.endPage=1&productId="
                       + stock
                       + "&securityType=0101%2C120100%2C020100%2C020200%2C120200&reportType2=DQBG&reportType="
                       + bigCategoryId
                       + "&beginDate=" + timestart
                       + "&endDate=" + timeend)
                response = requests.get(url=url, headers=headers, timeout=timeout)

                # The reply is JSONP, i.e. callbackName({...}). Unwrap the
                # parentheses and parse the whole object rather than slicing
                # the text between two hard-coded neighbouring keys.
                body = response.text
                result = json.loads(body[body.index("(") + 1:body.rindex(")")])
                datas = result["pageHelp"]["data"]

                # Covers both None and an empty list; the None case must be
                # checked before anything calls len() on the value.
                if not datas:
                    print("警告:股票代码:" + stock + " " + str(year) + title + "不存在!")
                    continue

                for entry in datas:
                    # Skip summaries; only full reports are wanted.
                    if "摘要" in entry['TITLE']:
                        continue

                    # Create <path><stock>/<year> (and any missing parents).
                    file_path = path + stock + "/" + str(year)
                    os.makedirs(file_path, exist_ok=True)

                    # "*" (e.g. in "*ST" names) is stripped from the file name.
                    file = (file_path + "/" + str(year) + title + "##"
                            + entry['TITLE'].replace("*", "") + ".pdf")

                    # Download only when the file is not already on disk.
                    if os.path.exists(file):
                        print("警告:股票代码:" + stock + " " + str(year) + title + "已存在!")
                        continue

                    filecontent = requests.get(download_url + entry["URL"],
                                               timeout=timeout)
                    # Never store an error page as a .pdf: it would be
                    # skipped as "already exists" on every later run.
                    if filecontent.status_code != 200:
                        print("警告:股票代码:" + stock + " " + str(year) + title + "下载失败!")
                        continue
                    with open(file, "wb") as pdf:
                        pdf.write(filecontent.content)
                    print("股票代码:" + stock + " " + str(year) + title + "写入成功。")


# Example usage:
# timestart = [2023]
# stock_list = ['600011']
# financial_statements_type = ['annual', 'semi-annual', 'quarterly_1', 'quarterly_3']
# SH_financial_statement_path = "F:/data/SH/"
# get_financial_statements(SH_financial_statement_path, timestart,stock_list,financial_statements_type)