refactor: extract model to module and add image sorting script
Co-authored-by: aider (gemini/gemini-2.5-pro-preview-05-06) <aider@aider.chat>
model.py (new file, +72 lines)
@@ -0,0 +1,72 @@
+import torch.nn as nn
+from PIL import Image, ImageDraw
+
+# Custom transform to crop a triangle from the lower right corner
+class CropLowerRightTriangle(object):
+    """
+    Crops a rectangular area from the lower right corner of an image,
+    then masks it to a triangle.
+    The user can adjust the geometry of the triangle.
+    """
+    def __init__(self, triangle_width, triangle_height):
+        self.triangle_width = triangle_width
+        self.triangle_height = triangle_height
+
+    def __call__(self, img):
+        img_width, img_height = img.size
+
+        # Define the bounding box for the crop
+        left = img_width - self.triangle_width
+        top = img_height - self.triangle_height
+        right = img_width
+        bottom = img_height
+
+        # Crop a rectangle from the lower right corner
+        cropped_img = img.crop((left, top, right, bottom))
+
+        # Create a triangular mask. The mask is the same size as the cropped rectangle.
+        mask = Image.new('L', (self.triangle_width, self.triangle_height), 0)
+        # The polygon vertices define the lower-right triangle within the rectangle.
+        # Vertices are (top-right, bottom-left, bottom-right).
+        polygon = [(self.triangle_width, 0), (0, self.triangle_height), (self.triangle_width, self.triangle_height)]
+        ImageDraw.Draw(mask).polygon(polygon, fill=255)
+
+        # Create a black background image.
+        background = Image.new("RGB", cropped_img.size, (0, 0, 0))
+
+        # Paste the original cropped image onto the background using the mask.
+        # Where the mask is white, the image is pasted. Where black, it's not.
+        background.paste(cropped_img, (0, 0), mask)
+
+        return background
+
+# Define the CNN
+class GarageDoorCNN(nn.Module):
+    def __init__(self, resize_dim=64):
+        super(GarageDoorCNN, self).__init__()
+        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
+        self.relu1 = nn.ReLU()
+        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
+        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
+        self.relu2 = nn.ReLU()
+        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
+        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
+        self.relu3 = nn.ReLU()
+        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)
+
+        # Calculate the size of the flattened features after convolutions and pooling
+        final_dim = resize_dim // (2**3)  # 3 pooling layers with stride 2
+        self.fc1_input_features = 64 * final_dim * final_dim
+
+        self.fc1 = nn.Linear(self.fc1_input_features, 512)
+        self.relu4 = nn.ReLU()
+        self.fc2 = nn.Linear(512, 2)  # 2 classes: open, closed
+
+    def forward(self, x):
+        x = self.pool1(self.relu1(self.conv1(x)))
+        x = self.pool2(self.relu2(self.conv2(x)))
+        x = self.pool3(self.relu3(self.conv3(x)))
+        x = x.view(-1, self.fc1_input_features)  # Flatten the tensor
+        x = self.relu4(self.fc1(x))
+        x = self.fc2(x)
+        return x
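
Note: the flattened-feature arithmetic above is worth checking once: with resize_dim=64, three stride-2 pools leave a 64-channel 8x8 map, so fc1 expects 64 * 8 * 8 = 4096 inputs. Below is a minimal shape check, not part of the commit; it assumes model.py is importable and uses a hypothetical 1600x1200 dummy frame (any image at least 556 wide and 1184 tall fits the crop used elsewhere in this commit).

import torch
from PIL import Image
from torchvision import transforms

from model import CropLowerRightTriangle, GarageDoorCNN

# Dummy frame; the real camera resolution may differ, but it must be at
# least as large as the requested crop (556 wide, 1184 tall).
img = Image.new("RGB", (1600, 1200), (128, 128, 128))

transform = transforms.Compose([
    CropLowerRightTriangle(triangle_width=556, triangle_height=1184),
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
])

x = transform(img).unsqueeze(0)      # (1, 3, 64, 64)
model = GarageDoorCNN(resize_dim=64)
with torch.no_grad():
    logits = model(x)
print(logits.shape)                  # torch.Size([1, 2]): one logit per class

With untrained weights the logits are meaningless; the point is only that the transform's output size and fc1_input_features agree.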
sort.py (new file, +85 lines)
@@ -0,0 +1,85 @@
+import torch
+import torch.nn.functional as F
+from torchvision import transforms
+from PIL import Image
+import os
+import shutil
+
+from model import CropLowerRightTriangle, GarageDoorCNN
+
+def sort_images():
+    # --- Configuration ---
+    MODEL_PATH = 'garage_door_cnn.pth'
+    SOURCE_DIR = 'data/hourly_photos/'
+    DEST_DIR = 'data/sorted/open/'
+
+    # These must match the parameters used during training
+    TRIANGLE_CROP_WIDTH = 556
+    TRIANGLE_CROP_HEIGHT = 1184
+    RESIZE_DIM = 64
+
+    # The classes are sorted alphabetically by ImageFolder: ['closed', 'open']
+    CLASS_NAMES = ['closed', 'open']
+    TARGET_CLASS = 'open'
+    TARGET_CLASS_IDX = CLASS_NAMES.index(TARGET_CLASS)
+
+    # --- Setup ---
+    # Create destination directory if it doesn't exist
+    os.makedirs(DEST_DIR, exist_ok=True)
+
+    # Set up device
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    print(f"Using device: {device}")
+
+    # Load model
+    model = GarageDoorCNN(resize_dim=RESIZE_DIM)
+    model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
+    model.to(device)
+    model.eval()
+
+    # Define image transforms
+    data_transform = transforms.Compose([
+        CropLowerRightTriangle(triangle_width=TRIANGLE_CROP_WIDTH, triangle_height=TRIANGLE_CROP_HEIGHT),
+        transforms.Resize((RESIZE_DIM, RESIZE_DIM)),
+        transforms.ToTensor(),
+        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+    ])
+
+    # --- Process Images ---
+    print(f"Scanning images in {SOURCE_DIR}...")
+    with torch.no_grad():
+        for filename in os.listdir(SOURCE_DIR):
+            file_path = os.path.join(SOURCE_DIR, filename)
+            if os.path.isfile(file_path):
+                try:
+                    image = Image.open(file_path).convert('RGB')
+
+                    # Apply transformations
+                    input_tensor = data_transform(image)
+                    input_batch = input_tensor.unsqueeze(0)  # create a mini-batch as expected by the model
+                    input_batch = input_batch.to(device)
+
+                    # Get model output
+                    output = model(input_batch)
+
+                    # Get probabilities and prediction
+                    probabilities = F.softmax(output, dim=1)
+                    confidence, pred_idx = torch.max(probabilities, 1)
+
+                    if pred_idx.item() == TARGET_CLASS_IDX:
+                        print(f"Found 'open' image: {file_path} with confidence: {confidence.item():.4f}")
+                        # Copy file
+                        shutil.copy(file_path, os.path.join(DEST_DIR, filename))
+
+                except Exception as e:
+                    print(f"Could not process file {file_path}: {e}")
+
+    print("Sorting complete.")
+
+if __name__ == '__main__':
+    if not os.path.exists('garage_door_cnn.pth'):
+        print("Error: Model file 'garage_door_cnn.pth' not found. Please run train.py first.")
+    elif not os.path.isdir('data/hourly_photos'):
+        print("Error: Source directory 'data/hourly_photos' not found.")
+    else:
+        sort_images()
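
Note: F.softmax(output, dim=1) converts the two logits into probabilities that sum to 1, and torch.max returns both the winning probability (used here as confidence) and its index, which is interpreted against the alphabetical ImageFolder order ['closed', 'open']. A single-image sketch of the same inference path, assuming the trained checkpoint exists and using 'sample.jpg' as a placeholder path:

import torch
import torch.nn.functional as F
from PIL import Image
from torchvision import transforms

from model import CropLowerRightTriangle, GarageDoorCNN

CLASS_NAMES = ['closed', 'open']  # ImageFolder sorts class folders alphabetically

model = GarageDoorCNN(resize_dim=64)
model.load_state_dict(torch.load('garage_door_cnn.pth', map_location='cpu'))
model.eval()

tf = transforms.Compose([
    CropLowerRightTriangle(triangle_width=556, triangle_height=1184),
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
    # Same ImageNet mean/std as sort.py; must match whatever training used
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

with torch.no_grad():
    x = tf(Image.open('sample.jpg').convert('RGB')).unsqueeze(0)
    probs = F.softmax(model(x), dim=1)[0]
for name, p in zip(CLASS_NAMES, probs):
    print(f"{name}: {p.item():.4f}")

A natural extension, not in this commit, would be to require confidence.item() above a threshold (say 0.9) before the shutil.copy, keeping borderline frames out of data/sorted/open/.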
train.py (69 lines changed)
@@ -6,75 +6,8 @@ from torchvision import datasets, transforms
 from PIL import Image, ImageDraw
 import os
 
-# Custom transform to crop a triangle from the lower right corner
-class CropLowerRightTriangle(object):
-    """
-    Crops a rectangular area from the lower right corner of an image,
-    then masks it to a triangle.
-    The user can adjust the geometry of the triangle.
-    """
-    def __init__(self, triangle_width, triangle_height):
-        self.triangle_width = triangle_width
-        self.triangle_height = triangle_height
+from model import CropLowerRightTriangle, GarageDoorCNN
 
-    def __call__(self, img):
-        img_width, img_height = img.size
-
-        # Define the bounding box for the crop
-        left = img_width - self.triangle_width
-        top = img_height - self.triangle_height
-        right = img_width
-        bottom = img_height
-
-        # Crop a rectangle from the lower right corner
-        cropped_img = img.crop((left, top, right, bottom))
-
-        # Create a triangular mask. The mask is the same size as the cropped rectangle.
-        mask = Image.new('L', (self.triangle_width, self.triangle_height), 0)
-        # The polygon vertices define the lower-right triangle within the rectangle.
-        # Vertices are (top-right, bottom-left, bottom-right).
-        polygon = [(self.triangle_width, 0), (0, self.triangle_height), (self.triangle_width, self.triangle_height)]
-        ImageDraw.Draw(mask).polygon(polygon, fill=255)
-
-        # Create a black background image.
-        background = Image.new("RGB", cropped_img.size, (0, 0, 0))
-
-        # Paste the original cropped image onto the background using the mask.
-        # Where the mask is white, the image is pasted. Where black, it's not.
-        background.paste(cropped_img, (0, 0), mask)
-
-        return background
-
-# Define the CNN
-class GarageDoorCNN(nn.Module):
-    def __init__(self, resize_dim=64):
-        super(GarageDoorCNN, self).__init__()
-        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
-        self.relu1 = nn.ReLU()
-        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
-        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
-        self.relu2 = nn.ReLU()
-        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
-        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
-        self.relu3 = nn.ReLU()
-        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)
-
-        # Calculate the size of the flattened features after convolutions and pooling
-        final_dim = resize_dim // (2**3)  # 3 pooling layers with stride 2
-        self.fc1_input_features = 64 * final_dim * final_dim
-
-        self.fc1 = nn.Linear(self.fc1_input_features, 512)
-        self.relu4 = nn.ReLU()
-        self.fc2 = nn.Linear(512, 2)  # 2 classes: open, closed
-
-    def forward(self, x):
-        x = self.pool1(self.relu1(self.conv1(x)))
-        x = self.pool2(self.relu2(self.conv2(x)))
-        x = self.pool3(self.relu3(self.conv3(x)))
-        x = x.view(-1, self.fc1_input_features)  # Flatten the tensor
-        x = self.relu4(self.fc1(x))
-        x = self.fc2(x)
-        return x
 
 def train_model():
     # --- Hyperparameters and Configuration ---