使用 OCR 从表格图像中提取各个字段并导出到 Excel

2022-01-09 00:00:00 python opencv computer-vision image-processing ocr

问题描述

我已经扫描了具有如下图所示表格的图像:

I have scanned images which have tables as shown in this image:

我正在尝试分别提取每个框并执行 OCR，但是当我尝试检测水平线和垂直线然后检测框时，它会返回以下图像:

I am trying to extract each box separately and perform OCR but when I try to detect horizontal and vertical lines and then detect boxes it's returning the following image:

当我尝试执行其他变换(腐蚀和膨胀)来检测文本时，一些残留的线条仍然会和文本一起出现，如下所示:

And when I try to perform other transformations to detect text (erode and dilate) some remains of lines are still coming along with text like below:

我无法只检测出文本来执行 OCR，而且生成的边界框也不正确，如下所示:

I cannot detect text only to perform OCR and proper bounding boxes aren't being generated like below:

使用真实的表格线时，我无法得到清晰分隔的框；我曾在一张用画图(Paint)编辑过、添加了数字的图像上(如下所示)尝试过这种方法，它是有效的.

I cannot get clearly separated boxes using real lines, I've tried this on an image that was edited in paint(as shown below) to add digits and it works.

我不知道我做错了哪一部分，但如果有什么我应该尝试或更改/添加我的问题，请告诉我.

I don't know which part I'm doing wrong, but if there's anything I should try, or anything I should change/add in my question, please tell me.

# Question script: the asker's attempt at isolating table cells for OCR.
# It removes table borders with morphology, dilates the remaining text into
# blobs, and keeps the bounding boxes whose size looks like a table cell.

# Loading all required libraries
# %pylab inline  # IPython magic; uncomment when running in a Jupyter notebook
import cv2
import numpy as np
import pandas as pd
import pytesseract
import matplotlib.pyplot as plt
import statistics
from time import sleep
import random

# Flag 0 loads the scan as a single-channel grayscale image.
img = cv2.imread('images/scan1.jpg',0)

# for adding border to an image
img1= cv2.copyMakeBorder(img,50,50,50,50,cv2.BORDER_CONSTANT,value=[255,255])

# Thresholding the image
# NOTE: with THRESH_OTSU the threshold is chosen automatically, so the
# explicit threshold argument (the first 255) is ignored.
(thresh, th3) = cv2.threshold(img1, 255, 255,cv2.THRESH_BINARY|cv2.THRESH_OTSU)

# to flip image pixel values
th3 = 255-th3

# initialize kernels for table boundaries detections
# (shorter line kernels for images under 1000 rows, longer ones otherwise)
if(th3.shape[0]<1000):
    ver = np.array([[1], [1], [1], [1], [1], [1], [1]])
    hor = np.array([[1,1,1,1,1,1]])
else:
    ver = np.array([[1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1]])
    hor = np.array([[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]])

# to detect vertical lines of table borders
img_temp1 = cv2.erode(th3, ver, iterations=3)
verticle_lines_img = cv2.dilate(img_temp1, ver, iterations=3)

# to detect horizontal lines of table borders
img_hor = cv2.erode(th3, hor, iterations=3)
hor_lines_img = cv2.dilate(img_hor, hor, iterations=4)

# adding horizontal and vertical lines
hor_ver = cv2.add(hor_lines_img,verticle_lines_img)
hor_ver = 255-hor_ver

# subtracting table borders from image
temp = cv2.subtract(th3,hor_ver)
temp = 255-temp

# Doing xor operation for erasing table boundaries
tt = cv2.bitwise_xor(img1,temp)
iii = cv2.bitwise_not(tt)
tt1=iii.copy()

# kernel initialization
ver1 = np.array([[1,1], [1,1], [1,1], [1,1], [1,1], [1,1], [1,1], [1,1], [1,1]])
hor1 = np.array([[1,1,1,1,1,1,1,1,1,1], [1,1,1,1,1,1,1,1,1,1]])

# morphological operation
temp1 = cv2.erode(tt1, ver1, iterations=2)
verticle_lines_img1 = cv2.dilate(temp1, ver1, iterations=1)
temp12 = cv2.erode(tt1, hor1, iterations=1)
hor_lines_img2 = cv2.dilate(temp12, hor1, iterations=1)

# doing or operation for detecting only text part and removing rest all
hor_ver = cv2.add(hor_lines_img2,verticle_lines_img1)
dim1 = (hor_ver.shape[1],hor_ver.shape[0])
dim = (hor_ver.shape[1]*2,hor_ver.shape[0]*2)

# resizing image to its double size to increase the text size
resized = cv2.resize(hor_ver, dim, interpolation = cv2.INTER_AREA)

# bitwise not operation for flipping the pixel values so as to apply
# morphological operations such as dilation and erosion
want = cv2.bitwise_not(resized)

# NOTE: only kernel1 is used below; kernel2 and kernel3 are never referenced.
if(want.shape[0]<1000):
    kernel1 = np.array([[1,1,1]])
    kernel2 = np.array([[1,1], [1,1]])
    kernel3 = np.array([[1,0,1],[0,1,0], [1,0,1]])
else:
    kernel1 = np.array([[1,1,1,1,1,1]])
    kernel2 = np.array([[1,1,1,1,1], [1,1,1,1,1], [1,1,1,1,1], [1,1,1,1,1]])

tt1 = cv2.dilate(want,kernel1,iterations=2)

# getting image back to its original size
resized1 = cv2.resize(tt1, dim1, interpolation = cv2.INTER_AREA)

# Find contours for image, which will detect all the boxes
contours1, hierarchy1 = cv2.findContours(resized1, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)


def sort_contours(cnts, method="left-to-right"):
    """Sort contours by the position of their bounding boxes.

    Args:
        cnts: Sequence of contours as returned by ``cv2.findContours``.
        method: One of ``"left-to-right"``, ``"right-to-left"``,
            ``"top-to-bottom"`` or ``"bottom-to-top"``. Any other value
            behaves like ``"left-to-right"``.

    Returns:
        A ``(contours, bounding_boxes)`` pair of tuples in sorted order,
        where each bounding box is the ``(x, y, w, h)`` tuple produced by
        ``cv2.boundingRect``.

    Raises:
        ValueError: If ``cnts`` is empty (nothing to unpack from ``zip``).
    """
    # initialize the reverse flag and sort index
    reverse = False
    i = 0

    # handle if we need to sort in reverse
    if method == "right-to-left" or method == "bottom-to-top":
        reverse = True

    # handle if we are sorting against the y-coordinate rather than
    # the x-coordinate of the bounding box
    if method == "top-to-bottom" or method == "bottom-to-top":
        i = 1

    # construct the list of bounding boxes and sort contours and boxes
    # together on the chosen coordinate (index 0 is x, index 1 is y)
    boundingBoxes = [cv2.boundingRect(c) for c in cnts]
    (cnts, boundingBoxes) = zip(*sorted(zip(cnts, boundingBoxes),
                                        key=lambda b:b[1][i], reverse=reverse))

    # return the list of sorted contours and bounding boxes
    return (cnts, boundingBoxes)


# sorting contours by calling function
(cnts, boundingBoxes) = sort_contours(contours1, method="top-to-bottom")

# storing value of all bounding box height
heightlist=[]
for i in range(len(boundingBoxes)):
    heightlist.append(boundingBoxes[i][3])

# sorting height values
heightlist.sort()

sportion = int(.5*len(heightlist))
eportion = int(0.05*len(heightlist))

# taking 50% to 95% values of heights and calculate their mean
# this will neglect small bounding box which are basically noise
try:
    medianheight = statistics.mean(heightlist[-sportion:-eportion])
except statistics.StatisticsError:
    # With fewer than 20 boxes eportion is 0, so the slice above is empty and
    # mean() raises; fall back to dropping the two tallest boxes instead.
    medianheight = statistics.mean(heightlist[-sportion:-2])

# keeping bounding box which are having height more than 70% of the mean
# height and deleting all those value where ratio of width to height is
# less than 0.9
box =[]
imag = iii.copy()
for i in range(len(cnts)):
    cnt = cnts[i]
    x,y,w,h = cv2.boundingRect(cnt)
    if(h>=.7*medianheight and w/h > 0.9):
        image = cv2.rectangle(imag,(x+4,y-2),(x+w-5,y+h),(0,255,0),1)
        box.append([x,y,w,h])
    # to show image

### Now we have badly detected boxes image as shown

解决方案

你在正确的轨道上.这是您的方法的延续，稍作修改.这个想法是:

You're on the right track. Here's a continuation of your approach with slight modifications. The idea is:

获取二值图像.加载图像，转灰度，大津阈值.

Obtain binary image. Load image, convert to grayscale, and Otsu's threshold.

删除所有字符文本轮廓.我们创建一个矩形内核并执行形态学开运算(opening)以仅保留水平/垂直线.这会有效地把文本变成微小的噪声，因此我们查找轮廓并按轮廓面积进行过滤以去除它们.

Remove all character text contours. We create a rectangular kernel and perform opening to only keep the horizontal/vertical lines. This will effectively make the text into tiny noise so we find contours and filter using contour area to remove them.

修复水平/垂直线并提取每个 ROI. 我们使用形态学闭运算(morph close)修复断线并平滑表格.接着我们使用带有 top-to-bottom 参数的 imutils.sort_contours() 对框字段轮廓进行排序.然后我们查找轮廓并按轮廓面积进行过滤，最后提取每个 ROI.

Repair horizontal/vertical lines and extract each ROI. We morph close to fix any broken lines and smooth the table. From here we sort the box field contours using imutils.sort_contours() with the top-to-bottom parameter. Next we find contours and filter using contour area, then extract each ROI.

<hr>
这是每个框字段和提取的 ROI 的可视化

Here's a visualization of each box field and the extracted ROI

代码

# Answer script: isolate each table cell of a scanned form and extract it as
# a region of interest (ROI) that can then be passed to OCR.
import cv2
import numpy as np
from imutils import contours

# Load image, grayscale, Otsu's threshold
image = cv2.imread('1.jpg')
original = image.copy()  # untouched copy; ROIs are cropped from this
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

# Remove text characters with morph open and contour filtering
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))
opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=1)
cnts = cv2.findContours(opening, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
# findContours returns 2 values in some OpenCV versions and 3 in others;
# pick the contour list in either case.
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
    area = cv2.contourArea(c)
    if area < 500:
        # Small contours are leftover text fragments: paint them black.
        cv2.drawContours(opening, [c], -1, (0,0,0), -1)

# Repair table lines, sort contours, and extract ROI
# Inverting after the close makes the cell interiors white, so each cell
# becomes its own contour.
close = 255 - cv2.morphologyEx(opening, cv2.MORPH_CLOSE, kernel, iterations=1)
cnts = cv2.findContours(close, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
(cnts, _) = contours.sort_contours(cnts, method="top-to-bottom")
for c in cnts:
    area = cv2.contourArea(c)
    # Only contours below this area are treated as box fields.
    if area < 25000:
        x,y,w,h = cv2.boundingRect(c)
        cv2.rectangle(image, (x, y), (x + w, y + h), (36,255,12), -1)
        ROI = original[y:y+h, x:x+w]

        # Visualization
        cv2.imshow('image', image)
        cv2.imshow('ROI', ROI)
        cv2.waitKey(20)

cv2.imshow('opening', opening)
cv2.imshow('close', close)
cv2.imshow('image', image)
cv2.waitKey()

相关文章