本文介绍了在R中写入数据帧时出错的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

29岁程序员,3月因学历无情被辞!

我试图在从OCR格式的pdf文件中提取的文本里搜索一个单词。这个pdf文件有多个页面,所以对每个页面我都会搜索该单词;如果找到了,就把文件名、状态(Present 或 Not Present)、找到该单词的页面以及找到的单词写入数据帧。但数据帧对所有文件给出的状态都是“Present”,而我只想得到这样的结果:

file_name Status Page words
test1.pdf "Present" test1_2,test1_4 gym,school
test2.pdf "Not Present" - -
test3.pdf "Present" test3_1 gym

这段代码中我遗漏了什么?这里是代码:

All_files=Sys.glob("*.pdf")
v1 <- numeric(length(All_files))
chk_words=c("Swimming pool","Gym","west","para")
word <- "Gym"
tc=c()
ps=c()
x=list()
df <- data.frame()
Status="Present"
for (i in seq_along(All_files)){
  file_name <- All_files[i]
  cnt <- pdf_info(All_files[i])$pages
  print(cnt)
  for(j in seq_len(cnt)){
    img_file <- pdftools::pdf_convert(All_files[i], format = 'tiff', pages = j, dpi = 400)
    text <- ocr(img_file)
    ocr_text <- capture.output(cat(text))
    check <- sapply(ocr_text, paste, collapse="")
    junk <- dir(path="D:/Deepesh/R Script/All_PDF_Files/Registration_Certificates_OCR", pattern="tiff")
    file.remove(junk)
    br <-if(length(which(stri_detect_fixed(tolower(check),tolower(word)))) <= 0) "Not Present" else "Present"
    print(br)
    if(br=="Present") {
      v1[i] <- j
      break}
    for(k in chk_words){
      br=if(length(which(stri_detect_fixed(tolower(check),tolower(k)))) <= 0){
        print("Not Present")
      } else {print("Present")}
      if(br == "Present") ps=k
      x[[k]]=ps
      tc=unlist(unique(x))
    }
  }
  print(tc)
  Status <- if(v1[i] == 0) "Not Present" else "Present"
  pages <- if(v1[i] == 0) "-" else paste0(tools::file_path_sans_ext(basename(file_name)), "_", v1[i])
  words <- if(v1[i] == 0) "-" else word
  df <- rbind(df, cbind(file_name = basename(file_name), Status, pages = pages, words = words,tc))
}

如有任何建议,不胜感激。谢谢

这是针对单个单词的一个方案:

v1 <- numeric(length(All_files))
word <- "school"
df <- data.frame()
Status="Present"
for (i in seq_along(All_files)){
  file_name <- All_files[i]
  cnt <- pdf_info(All_files[i])$pages
  print(cnt)
  for(j in seq_len(cnt)){
    img_file <- pdftools::pdf_convert(All_files[i], format = 'tiff', pages = j, dpi = 400)
    text <- ocr(img_file)
    ocr_text <- capture.output(cat(text))
    check <- sapply(ocr_text, paste, collapse="")
    junk <- dir(path= paste0(path, "/tiff"), pattern="tiff")
    file.remove(junk)
    br <-if(length(which(stri_detect_fixed(tolower(check),tolower(word)))) <= 0) "Not Present" else "Present"
    print(br)
    if(br=="Present") {
      v1[i] <- j
      break}
  }
  Status <- if(v1[i] == 0) "Not Present" else "Present"
  pages <- if(v1[i] == 0) "-" else paste0(tools::file_path_sans_ext(basename(file_name)), "_", v1[i])
  words <- if(v1[i] == 0) "-" else word
  df <- rbind(df, cbind(file_name = basename(file_name), Status, pages = pages, words = words))
}

输出:

df
#      file_name      Status  pages  words
#1 Amenities.pdf Not Present      -      -
#2      test.pdf     Present test_2 school

I'm trying to search a
word from the text that I extract from a PDF file which is in OCR'd format. This PDF file has multiple pages, so for each page I search for that word; if the word is found, I write the file name, the status ("Present" or "Not Present"), the page on which it was found, and the words that were found to a data frame. But the data frame gives the status "Present" for all files. I just want output like this:

file_name Status Page words
test1.pdf "Present" test1_2,test1_4 gym,school
test2.pdf "Not Present" - -
test3.pdf "Present" test3_1 gym

What am I missing in this code? Here is the code:

All_files=Sys.glob("*.pdf")
v1 <- numeric(length(All_files))
chk_words=c("Swimming pool","Gym","west","para")
word <- "Gym"
tc=c()
ps=c()
x=list()
df <- data.frame()
Status="Present"
for (i in seq_along(All_files)){
  file_name <- All_files[i]
  cnt <- pdf_info(All_files[i])$pages
  print(cnt)
  for(j in seq_len(cnt)){
    img_file <- pdftools::pdf_convert(All_files[i], format = 'tiff', pages = j, dpi = 400)
    text <- ocr(img_file)
    ocr_text <- capture.output(cat(text))
    check <- sapply(ocr_text, paste, collapse="")
    junk <- dir(path="D:/Deepesh/R Script/All_PDF_Files/Registration_Certificates_OCR", pattern="tiff")
    file.remove(junk)
    br <-if(length(which(stri_detect_fixed(tolower(check),tolower(word)))) <= 0) "Not Present" else "Present"
    print(br)
    if(br=="Present") {
      v1[i] <- j
      break}
    for(k in chk_words){
      br=if(length(which(stri_detect_fixed(tolower(check),tolower(k)))) <= 0){
        print("Not Present")
      } else {print("Present")}
      if(br == "Present") ps=k
      x[[k]]=ps
      tc=unlist(unique(x))
    }
  }
  print(tc)
  Status <- if(v1[i] == 0) "Not Present" else "Present"
  pages <- if(v1[i] == 0) "-" else paste0(tools::file_path_sans_ext(basename(file_name)), "_", v1[i])
  words <- if(v1[i] == 0) "-" else word
  df <- rbind(df, cbind(file_name = basename(file_name), Status, pages = pages, words = words,tc))
}

Any suggestions are appreciated. Thanks

解决方案

Here is an option for a single word:

v1 <- numeric(length(All_files))
word <- "school"
df <- data.frame()
Status="Present"
for (i in
seq_along(All_files)){
  file_name <- All_files[i]
  cnt <- pdf_info(All_files[i])$pages
  print(cnt)
  for(j in seq_len(cnt)){
    img_file <- pdftools::pdf_convert(All_files[i], format = 'tiff', pages = j, dpi = 400)
    text <- ocr(img_file)
    ocr_text <- capture.output(cat(text))
    check <- sapply(ocr_text, paste, collapse="")
    junk <- dir(path= paste0(path, "/tiff"), pattern="tiff")
    file.remove(junk)
    br <-if(length(which(stri_detect_fixed(tolower(check),tolower(word)))) <= 0) "Not Present" else "Present"
    print(br)
    if(br=="Present") {
      v1[i] <- j
      break}
  }
  Status <- if(v1[i] == 0) "Not Present" else "Present"
  pages <- if(v1[i] == 0) "-" else paste0(tools::file_path_sans_ext(basename(file_name)), "_", v1[i])
  words <- if(v1[i] == 0) "-" else word
  df <- rbind(df, cbind(file_name = basename(file_name), Status, pages = pages, words = words))
}

-output

df
#      file_name      Status  pages  words
#1 Amenities.pdf Not Present      -      -
#2      test.pdf     Present test_2 school

这篇关于在R中写入数据帧时出错的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持! 上岸,阿里云!
08-20 15:01