From 1c083132a230495d0f9b7a8c1c1cf9c95666cf9c Mon Sep 17 00:00:00 2001 From: "Tanner Collin (aider)" Date: Tue, 3 Jun 2025 15:59:42 -0600 Subject: [PATCH] refactor: Preserve original metadata when embedding GUID --- swap_guids.py | 75 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 31 deletions(-) diff --git a/swap_guids.py b/swap_guids.py index 7b925fa..7a09794 100644 --- a/swap_guids.py +++ b/swap_guids.py @@ -27,46 +27,59 @@ def modify_feed(context, feed): guid_text_to_embed = f" Guid: {new_guid_str}" source_path = article.source_path - - # Reconstruct metadata header from article.metadata - # article.metadata keys are typically lowercase. Capitalize them for convention. - metadata_header_lines = [] - for key, value in article.metadata.items(): - if isinstance(value, list): - # Convert list items to string and join with comma (e.g., for tags) - metadata_header_lines.append(f"{key.capitalize()}: {', '.join(map(str, value))}") - else: - # Ensure value is string for concatenation - metadata_header_lines.append(f"{key.capitalize()}: {str(value)}") - - # article._content holds the raw Markdown content string (after metadata parsing) + + # Ensure article object has the _content attribute if not hasattr(article, '_content'): log.error(f"Article '{article.title}' does not have '_content' attribute. Cannot embed Guid into source file.") - # This is a critical issue for the requested operation. raise Exception(f"Cannot find raw content for article '{article.title}' to embed Guid.") - markdown_body = article._content + # Read the original file content. + # Python's open() in text mode uses universal newlines by default, converting \r\n and \r to \n. + # Pelican's MarkdownReader also provides article._content with \n newlines. + try: + with open(source_path, 'r', encoding='utf-8') as f: + original_file_content_universal_newlines = f.read() + except Exception as e: + log.error(f"Failed to read original content from '{source_path}': {e}") + raise - # Split the markdown body into the first paragraph and the rest - # Paragraphs in Markdown are separated by one or more blank lines (\n\n) - parts = markdown_body.split('\n\n', 1) - first_paragraph_text = parts[0] - rest_of_body = parts[1] if len(parts) > 1 else "" + # Sanity check: the article's body content should be a suffix of the read file content. + if not original_file_content_universal_newlines.endswith(article._content): + log.error(f"Content mismatch for '{article.title}' in '{source_path}'. " + "The article's parsed content (article._content) does not match the " + "ending of the raw file (read with universal newlines). This is unexpected " + "and may indicate issues with file parsing or concurrent modifications.") + # For debugging, one might log tails of both strings here. + # log.debug(f"Tail of original file content: '{original_file_content_universal_newlines[-200:]}'") + # log.debug(f"Tail of article._content: '{article._content[-200:]}'") + raise Exception(f"Content boundary determination error for article '{article.title}'.") + + # Determine the metadata part by subtracting the length of the content body. + metadata_section_length = len(original_file_content_universal_newlines) - len(article._content) + metadata_part_from_file = original_file_content_universal_newlines[:metadata_section_length] - # Append the Guid text to the end of the first paragraph + # current_body_content is what Pelican parsed as the article's body. + current_body_content = article._content + + # Split this body content to find its first paragraph. + # Paragraphs in Markdown are separated by one or more blank lines (\n\n). + body_parts = current_body_content.split('\n\n', 1) + first_paragraph_of_body = body_parts[0] + rest_of_body_content = body_parts[1] if len(body_parts) > 1 else "" + + # Append the Guid text to the end of the first paragraph of the body. # .rstrip() removes any trailing whitespace/newlines from the paragraph itself before appending. - modified_first_paragraph = first_paragraph_text.rstrip() + guid_text_to_embed + modified_first_paragraph_of_body = first_paragraph_of_body.rstrip() + guid_text_to_embed - # Reconstruct the new markdown body - new_markdown_body = modified_first_paragraph - if rest_of_body: # Add back the rest of the body with the double newline separator - new_markdown_body += '\n\n' + rest_of_body + # Reconstruct the new body content with the embedded Guid. + new_body_content_with_guid = modified_first_paragraph_of_body + if rest_of_body_content: # Add back the rest of the body if it existed. + new_body_content_with_guid += '\n\n' + rest_of_body_content - # Combine metadata and new body to form the complete new file content - if metadata_header_lines: - full_new_content = "\n".join(metadata_header_lines) + "\n\n" + new_markdown_body - else: # No metadata, just the body - full_new_content = new_markdown_body + # Construct the full new file content by combining the original metadata part and the new body. + # This preserves the original metadata block verbatim (including comments, formatting, and original newline characters if any within it, + # as metadata_part_from_file is a direct slice from original_file_content_universal_newlines which has \n newlines). + full_new_content = metadata_part_from_file + new_body_content_with_guid try: with open(source_path, 'w', encoding='utf-8') as f: