正则表达式符号含义
\s 匹配任何空白字符,包括空格、制表符(\t)、换行符(\n)、回车符(\r)、垂直制表符(\v)和换页符(\f)。
\S 是 \s 的否定形式,匹配任何非空白字符。
\d 匹配任何数字字符;在 ASCII 模式(re.ASCII)下等价于 [0-9],而 Python 3 的 str 模式默认还会匹配其他 Unicode 十进制数字(例如全角数字)。
. 匹配除换行符(\n)以外的任意单个字符;启用 re.DOTALL 标志后也匹配换行符。
* 表示前面的元素重复 0 次或多次,例如 a* 可匹配空串、a、aa。
+ 表示前面的元素重复 1 次或多次,例如 a+ 可匹配 a、aa,但不匹配空串。
文本提取 示例代码
import re
import sys

from utils.ocr_images_pdf import cno_ocr
def extract_date(text):
    """Return the first Chinese-format date found in *text*.

    A date is four digits followed by 年, one or two digits followed by 月,
    and one or two digits followed by 日 (for example ``2023年5月6日``).

    Args:
        text: The string to search. ``None`` or an empty string is treated
            as containing no date.

    Returns:
        The matched date substring, or ``None`` when no date is present.
    """
    # Guard first: re.search raises TypeError when handed None.
    if not text:
        return None
    match = re.search(r'\d{4}年\d{1,2}月\d{1,2}日', text)
    return match.group(0) if match else None
def extract_department_level(text):
    """Return the text that follows 为 in the first 市民政局 sentence.

    The pattern finds 市民政局, lazily skips ahead on the same line to a 为,
    and captures the shortest non-empty run of characters that ends right
    before the next ideographic full stop (。).

    Args:
        text: The string to search. ``None`` or an empty string is treated
            as containing no match.

    Returns:
        The captured text, or ``None`` when the pattern does not match.
    """
    # Guard first: re.search raises TypeError when handed None.
    if not text:
        return None
    match = re.search(r'市民政局.*?为(.+?)。', text)
    return match.group(1) if match else None
# Item markers, in the order they are expected to appear in the duties section.
_DUTY_MARKERS = (
    '(一)', '(二)', '(三)', '(四)', '(五)',
    '(六)', '(七)', '(八)', '(九)', '(十)',
    '(十一)', '(十二)', '(十三)', '(十四)',
    '(十五)', '(十六)',
)

# Maps full-width parentheses (U+FF08 / U+FF09) onto ASCII ones so that the
# ASCII markers above match text that uses either form.
_DUTY_PAREN_TABLE = str.maketrans({'\uff08': '(', '\uff09': ')'})

# [\s\S] matches every character including newlines, so the section may span
# several lines. The colon may be ASCII (:) or full-width (U+FF1A).
_DUTIES_PATTERN = re.compile(r'主要职责是[:\uff1a]([\s\S]*?)第四条')


def extract_duties(text):
    """Split the duties section of *text* into its numbered items.

    The section is whatever lies between ``主要职责是:`` and the next
    ``第四条``. It is cut at each marker in ``(一)`` .. ``(十六)``, searched
    for in order; scanning stops at the first marker that is absent. Each
    returned item keeps its leading marker. Text that precedes the first
    marker, if any, is returned as the first item.

    Args:
        text: The document text. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of non-empty, whitespace-stripped items, or ``[]`` when the
        section is not found.
    """
    if not text:
        return []
    match = _DUTIES_PATTERN.search(text)
    if match is None:
        return []

    section = match.group(1).strip().translate(_DUTY_PAREN_TABLE)

    duties = []
    start = 0
    for marker in _DUTY_MARKERS:
        position = section.find(marker, start)
        if position == -1:
            break
        chunk = section[start:position].strip()
        if chunk:
            duties.append(chunk)
        # Do not skip past the marker: the next item should start with it.
        start = position

    tail = section[start:].strip()
    if tail:
        duties.append(tail)
    return duties
# Item markers, in the order they are expected to appear in the section.
_INSTITUTION_MARKERS = (
    '(一)', '(二)', '(三)', '(四)', '(五)',
    '(六)', '(七)', '(八)', '(九)', '(十)',
    '(十一)', '(十二)', '(十三)', '(十四)',
    '(十五)', '(十六)',
)

# Maps full-width parentheses (U+FF08 / U+FF09) onto ASCII ones so that the
# ASCII markers above match text that uses either form.
_INSTITUTION_PAREN_TABLE = str.maketrans({'\uff08': '(', '\uff09': ')'})

# [\s\S] matches every character including newlines. Whitespace after 第四条
# is optional and the colon may be ASCII (:) or full-width (U+FF1A).
_INSTITUTIONS_PATTERN = re.compile(
    r'第四条\s*市民政局设下列内设机构[:\uff1a]([\s\S]*?)第五条'
)


def _split_institution_entry(entry):
    """Split one entry into ``(name, description)`` at its first 。."""
    name, _, description = entry.partition('。')
    return name.strip(), description.strip()


def extract_institutions(text):
    """Extract the internal institutions listed in *text*.

    The section is whatever lies between ``第四条市民政局设下列内设机构:``
    and the next ``第五条``. It is cut at each marker in ``(一)`` ..
    ``(十六)``, searched for in order; scanning stops at the first marker
    that is absent. The markers themselves are dropped. Within each entry
    the text before the first 。 is the name and the remainder is the
    description (empty when the entry contains no 。).

    Args:
        text: The document text. ``None`` or an empty string yields ``{}``.

    Returns:
        A dict mapping institution name to description, in document order.
        Entries with an empty name are omitted. ``{}`` when the section is
        not found.
    """
    if not text:
        return {}
    match = _INSTITUTIONS_PATTERN.search(text)
    if match is None:
        return {}

    section = match.group(1).strip().translate(_INSTITUTION_PAREN_TABLE)

    entries = []
    start = 0
    for marker in _INSTITUTION_MARKERS:
        position = section.find(marker, start)
        if position == -1:
            break
        entries.append(section[start:position])
        # Skip past the marker so it is not part of the next entry's name.
        start = position + len(marker)
    entries.append(section[start:])

    institutions = {}
    for entry in entries:
        entry = entry.strip()
        if not entry:
            continue
        name, description = _split_institution_entry(entry)
        if name:
            institutions[name] = description
    return institutions


class db_sql:
    """Container for SQL statement strings.

    A related statement, kept here for reference only:

        INSERT INTO "compare_department_duty_summary"(department_name,
        department_duty, act_id, instance_id, summary_text, score, type,
        create_time, id) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)
    """

    # Insert statement for the "department_duty" table, with two %s
    # placeholders (department_name, department_duty).
    department_duty_insert_sql = 'INSERT INTO "department_duty"(department_name,department_duty) VALUES (%s,%s)'


def extract_department_duty(file_path='', save_dir='../data/tmp_check_data/tem_test/'):
    """OCR a document and extract its structured fields.

    Args:
        file_path: Passed to ``cno_ocr`` as its first argument. Defaults to
            the empty string, which is the value that used to be hard-coded.
        save_dir: Passed to ``cno_ocr`` as its second argument.

    Returns:
        A dict with the keys ``date``, ``department_level``, ``duties``,
        ``institutions`` and ``compilation``, each holding the result of the
        corresponding ``extract_*`` call.
    """
    # NOTE(review): presumably cno_ocr returns the recognized text as a str;
    # confirm against utils.ocr_images_pdf.
    text = cno_ocr(file_path, save_dir)
    return {
        'date': extract_date(text),
        'department_level': extract_department_level(text),
        'duties': extract_duties(text),
        'institutions': extract_institutions(text),
        # NOTE(review): extract_compilation is not defined in the visible
        # part of this file; this raises NameError unless it is provided
        # elsewhere. Confirm before relying on this entry.
        'compilation': extract_compilation(text),
    }


if __name__ == "__main__":
    extract_department_duty()