I was searching for a way to strip out pictures from these file types and this is the solution I came up with. It iterates through a given directory structure, copies any files with the proper extension, and renames the copy to filename.zip. Then it navigates through the zip structure and extracts all picture type files with the proper extension, and renames them to the original file name, with a number for uniqueness. Finally, it deletes the extracted directory trees it created.
Extracting pictures from text documents is part of my job, so this will actually save my company thousands of hours in the long run.
All of the code is below, and what I'm really asking is: Is there a better way? Is there something more efficient? Can it be scaled to include other formats? Could the text be extracted into a txt - for loading times on word vs notepad?
This solution works on my Linux machine, and I can extract the pictures, but I've yet to test on a Windows system.
import shutil
import os
import zipfile
def zipDoc(aFile,dirPath):
dotNDX = aFile.index(".") # position of the .
shortFN = aFile[:dotNDX] # name of the file before .
zipName = dirPath + shortFN + ".zip" # name and path of the file only .zip
shutil.copy2(dirPath + aFile, zipName) # copies all data from original into .zip format
useZIP = zipfile.ZipFile(zipName) # the usable zip file
return useZIP # returns the zipped file
def hasPicExtension(aFile): # if a file ends in a typical picture file extension, returns true
picEndings = [".jpeg",".jpg",".png",".bmp",".JPEG"".JPG",".BMP",".PNG"] # list of photo extensions
if aFile.endswith(tuple(picEndings)): # turn the list into a tuple, because .endswith accepts that
return True
else: # if it doesn't end in a picture extension
return False
def delDOCXEvidence(somePath): # removes the .docx file structures generated
# Working Linux code:
os.rmdir(somePath + "/word/media") # removes directory
os.rmdir(somePath + "/word") # removes more directory
# Untested windows code:
# os.rmdir(somePath + "\\\\word\\\\media") # removes directory
# os.rmdir(somePath + "\\\\word") #removes more directory
def delXLSXEvidence(somePath): # removes the .xlsx file structures generated
# Working Linux code:
os.rmdir(somePath + "/xl/media") # removes directory
os.rmdir(somePath + "/xl") # removes more directory
# Untested windows code:
# os.rmdir(somePath + "\\\\xl\\\\media") # removes directory
# os.rmdir(somePath + "\\\\xl") #removes more directory
def extractPicsFromDir(dirPath=""):
# when given a directory path, will extract all images from all .docx and .xlsx file types
if os.path.isdir(dirPath): # if the given path is a directory
for dirFile in os.listdir(dirPath): # loops through all files in the directory
dirFileName = os.fsdecode(dirFile) # strips out the file name
if dirFileName.endswith(".docx"):
useZIP = zipDoc(dirFile,dirPath) # turns it into a zip
picNum = 1 # number of pictures in file
for zippedFile in useZIP.namelist(): # loops through all files in the directory
if hasPicExtension(zippedFile): # if it ends with photo
useZIP.extract(zippedFile, path=dirPath) # extracts the picture to the path + word/media/
shutil.move(dirPath + str(zippedFile),dirPath + dirFileName[:dirFileName.index(".")] + " - " + str(picNum)) # moves the picture out
picNum += 1
delDOCXEvidence(dirPath) # removes the extracted file structure
os.remove(useZIP.filename) # removes zip file
# no evidence
if dirFileName.endswith(".xlsx"):
useZIP = zipDoc(dirFile,dirPath) # turns it into a zip
picNum = 1 # number of pictures in file
for zippedFile in useZIP.namelist(): # loops through all files in the directory
if hasPicExtension(zippedFile): # if it ends with photo
useZIP.extract(zippedFile, path=dirPath) # extracts the picture to the path + word/media/
shutil.move(dirPath + str(zippedFile),dirPath + dirFileName[:dirFileName.index(".")] + " - " + str(picNum)) # moves the picture out
picNum += 1
delXLSXEvidence(dirPath) # removes the extracted file structure
os.remove(useZIP.filename) # removes zip file
# no evidence
print("Not a directory path!")
uDir = input("Enter your directory: ")
Excel files are in the form of zip file.It is easy to extract images from excel or docx file:
import zipfile
from PIL import Image, ImageFilter
import io
blur = ImageFilter.GaussianBlur(40)
def redact_images(filename,FilePath):
outfile = filename.replace(".xlsx", "_redacted.xlsx")
with zipfile.ZipFile(filename) as inzip:
with zipfile.ZipFile(outfile, "w") as outzip:
i = 0
for info in inzip.infolist():
name = info.filename
content = inzip.read(info)
if name.endswith((".png", ".jpeg", ".gif")):
fmt = name.split(".")[-1]
Name = name.split("/")[-1]
img = Image.open(io.BytesIO(content))
img.save(FilePath + str(Name))
outb = io.BytesIO()
img.save(outb, fmt)
content = outb.getvalue()
info.file_size = len(content)
info.CRC = zipfile.crc32(content)
i += 1
outzip.writestr(info, content)
filename : Location of input excel file
FilePath : Location to save extracted images