Skip to content

Commit

Permalink
Merge branch 'main' into update_commandline_help
Browse files (browse the repository at this point in the history)
  • Loading branch information
sugatoray authored Dec 18, 2024
2 parents 2774800 + cb66b35 commit 6f3c762
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 12 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -162,3 +162,5 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
src/.DS_Store
.DS_Store
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# MarkItDown

[![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/)
![PyPI - Downloads](https://img.shields.io/pypi/dd/markitdown)



MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc.).
It supports:
Expand Down
35 changes: 23 additions & 12 deletions src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -1133,27 +1133,33 @@ def convert(
extracted_zip_folder_name = (
f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}"
)
new_folder = os.path.normpath(
extraction_dir = os.path.normpath(
os.path.join(os.path.dirname(local_path), extracted_zip_folder_name)
)
md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n"

# Safety check for path traversal
if not new_folder.startswith(os.path.dirname(local_path)):
return DocumentConverterResult(
title=None, text_content=f"[ERROR] Invalid zip file path: {local_path}"
)

try:
# Extract the zip file
# Extract the zip file safely
with zipfile.ZipFile(local_path, "r") as zipObj:
zipObj.extractall(path=new_folder)
# Safeguard against path traversal
for member in zipObj.namelist():
member_path = os.path.normpath(os.path.join(extraction_dir, member))
if (
not os.path.commonprefix([extraction_dir, member_path])
== extraction_dir
):
raise ValueError(
f"Path traversal detected in zip file: {member}"
)

# Extract all files safely
zipObj.extractall(path=extraction_dir)

# Process each extracted file
for root, dirs, files in os.walk(new_folder):
for root, dirs, files in os.walk(extraction_dir):
for name in files:
file_path = os.path.join(root, name)
relative_path = os.path.relpath(file_path, new_folder)
relative_path = os.path.relpath(file_path, extraction_dir)

# Get file extension
_, file_extension = os.path.splitext(name)
Expand All @@ -1177,7 +1183,7 @@ def convert(

# Clean up extracted files if specified
if kwargs.get("cleanup_extracted", True):
shutil.rmtree(new_folder)
shutil.rmtree(extraction_dir)

return DocumentConverterResult(title=None, text_content=md_content.strip())

Expand All @@ -1186,6 +1192,11 @@ def convert(
title=None,
text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}",
)
except ValueError as ve:
return DocumentConverterResult(
title=None,
text_content=f"[ERROR] Security error in zip file {local_path}: {str(ve)}",
)
except Exception as e:
return DocumentConverterResult(
title=None,
Expand Down

0 comments on commit 6f3c762

Please sign in to comment.