refactor: extract model to module and add image sorting script
Co-authored-by: aider (gemini/gemini-2.5-pro-preview-05-06) <aider@aider.chat>
model.py (new file, +72 lines)
@@ -0,0 +1,72 @@
+import torch.nn as nn
+from PIL import Image, ImageDraw
+
+# Custom transform to crop a triangle from the lower right corner
+class CropLowerRightTriangle(object):
+    """
+    Crops a rectangular area from the lower right corner of an image,
+    then masks it to a triangle.
+    The user can adjust the geometry of the triangle.
+    """
+    def __init__(self, triangle_width, triangle_height):
+        self.triangle_width = triangle_width
+        self.triangle_height = triangle_height
+
+    def __call__(self, img):
+        img_width, img_height = img.size
+
+        # Define the bounding box for the crop
+        left = img_width - self.triangle_width
+        top = img_height - self.triangle_height
+        right = img_width
+        bottom = img_height
+
+        # Crop a rectangle from the lower right corner
+        cropped_img = img.crop((left, top, right, bottom))
+
+        # Create a triangular mask. The mask is the same size as the cropped rectangle.
+        mask = Image.new('L', (self.triangle_width, self.triangle_height), 0)
+        # The polygon vertices define the lower-right triangle within the rectangle.
+        # Vertices are (top-right, bottom-left, bottom-right).
+        polygon = [(self.triangle_width, 0), (0, self.triangle_height), (self.triangle_width, self.triangle_height)]
+        ImageDraw.Draw(mask).polygon(polygon, fill=255)
+
+        # Create a black background image.
+        background = Image.new("RGB", cropped_img.size, (0, 0, 0))
+
+        # Paste the original cropped image onto the background using the mask.
+        # Where the mask is white, the image is pasted. Where black, it's not.
+        background.paste(cropped_img, (0, 0), mask)
+
+        return background
+
+# Define the CNN
+class GarageDoorCNN(nn.Module):
+    def __init__(self, resize_dim=64):
+        super(GarageDoorCNN, self).__init__()
+        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
+        self.relu1 = nn.ReLU()
+        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
+        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
+        self.relu2 = nn.ReLU()
+        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
+        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
+        self.relu3 = nn.ReLU()
+        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)
+
+        # Calculate the size of the flattened features after convolutions and pooling
+        final_dim = resize_dim // (2**3)  # 3 pooling layers with stride 2
+        self.fc1_input_features = 64 * final_dim * final_dim
+
+        self.fc1 = nn.Linear(self.fc1_input_features, 512)
+        self.relu4 = nn.ReLU()
+        self.fc2 = nn.Linear(512, 2)  # 2 classes: open, closed
+
+    def forward(self, x):
+        x = self.pool1(self.relu1(self.conv1(x)))
+        x = self.pool2(self.relu2(self.conv2(x)))
+        x = self.pool3(self.relu3(self.conv3(x)))
+        x = x.view(-1, self.fc1_input_features)  # Flatten the tensor
+        x = self.relu4(self.fc1(x))
+        x = self.fc2(x)
+        return x
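
Note: the flattened-feature arithmetic above is worth checking once: with resize_dim=64, three stride-2 pools leave a 64-channel 8x8 map, so fc1 expects 64 * 8 * 8 = 4096 inputs. Below is a minimal shape check, not part of the commit; it assumes model.py is importable and uses a hypothetical 1600x1200 dummy frame (any image at least 556 wide and 1184 tall fits the crop used elsewhere in this commit).

import torch
from PIL import Image
from torchvision import transforms

from model import CropLowerRightTriangle, GarageDoorCNN

# Dummy frame; the real camera resolution may differ, but it must be at
# least as large as the requested crop (556 wide, 1184 tall).
img = Image.new("RGB", (1600, 1200), (128, 128, 128))

transform = transforms.Compose([
    CropLowerRightTriangle(triangle_width=556, triangle_height=1184),
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
])

x = transform(img).unsqueeze(0)      # (1, 3, 64, 64)
model = GarageDoorCNN(resize_dim=64)
with torch.no_grad():
    logits = model(x)
print(logits.shape)                  # torch.Size([1, 2]): one logit per class

With untrained weights the logits are meaningless; the point is only that the transform's output size and fc1_input_features agree.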
sort.py (new file, +85 lines)
@@ -0,0 +1,85 @@
+import torch
+import torch.nn.functional as F
+from torchvision import transforms
+from PIL import Image
+import os
+import shutil
+
+from model import CropLowerRightTriangle, GarageDoorCNN
+
+def sort_images():
+    # --- Configuration ---
+    MODEL_PATH = 'garage_door_cnn.pth'
+    SOURCE_DIR = 'data/hourly_photos/'
+    DEST_DIR = 'data/sorted/open/'
+
+    # These must match the parameters used during training
+    TRIANGLE_CROP_WIDTH = 556
+    TRIANGLE_CROP_HEIGHT = 1184
+    RESIZE_DIM = 64
+
+    # The classes are sorted alphabetically by ImageFolder: ['closed', 'open']
+    CLASS_NAMES = ['closed', 'open']
+    TARGET_CLASS = 'open'
+    TARGET_CLASS_IDX = CLASS_NAMES.index(TARGET_CLASS)
+
+    # --- Setup ---
+    # Create destination directory if it doesn't exist
+    os.makedirs(DEST_DIR, exist_ok=True)
+
+    # Set up device
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    print(f"Using device: {device}")
+
+    # Load model
+    model = GarageDoorCNN(resize_dim=RESIZE_DIM)
+    model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
+    model.to(device)
+    model.eval()
+
+    # Define image transforms
+    data_transform = transforms.Compose([
+        CropLowerRightTriangle(triangle_width=TRIANGLE_CROP_WIDTH, triangle_height=TRIANGLE_CROP_HEIGHT),
+        transforms.Resize((RESIZE_DIM, RESIZE_DIM)),
+        transforms.ToTensor(),
+        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+    ])
+
+    # --- Process Images ---
+    print(f"Scanning images in {SOURCE_DIR}...")
+    with torch.no_grad():
+        for filename in os.listdir(SOURCE_DIR):
+            file_path = os.path.join(SOURCE_DIR, filename)
+            if os.path.isfile(file_path):
+                try:
+                    image = Image.open(file_path).convert('RGB')
+
+                    # Apply transformations
+                    input_tensor = data_transform(image)
+                    input_batch = input_tensor.unsqueeze(0)  # create a mini-batch as expected by the model
+                    input_batch = input_batch.to(device)
+
+                    # Get model output
+                    output = model(input_batch)
+
+                    # Get probabilities and prediction
+                    probabilities = F.softmax(output, dim=1)
+                    confidence, pred_idx = torch.max(probabilities, 1)
+
+                    if pred_idx.item() == TARGET_CLASS_IDX:
+                        print(f"Found 'open' image: {file_path} with confidence: {confidence.item():.4f}")
+                        # Copy file
+                        shutil.copy(file_path, os.path.join(DEST_DIR, filename))
+
+                except Exception as e:
+                    print(f"Could not process file {file_path}: {e}")
+
+    print("Sorting complete.")
+
+if __name__ == '__main__':
+    if not os.path.exists('garage_door_cnn.pth'):
+        print("Error: Model file 'garage_door_cnn.pth' not found. Please run train.py first.")
+    elif not os.path.isdir('data/hourly_photos'):
+        print("Error: Source directory 'data/hourly_photos' not found.")
+    else:
+        sort_images()
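
Note: F.softmax(output, dim=1) converts the two logits into probabilities that sum to 1, and torch.max returns both the winning probability (used here as confidence) and its index, which is interpreted against the alphabetical ImageFolder order ['closed', 'open']. A single-image sketch of the same inference path, assuming the trained checkpoint exists and using 'sample.jpg' as a placeholder path:

import torch
import torch.nn.functional as F
from PIL import Image
from torchvision import transforms

from model import CropLowerRightTriangle, GarageDoorCNN

CLASS_NAMES = ['closed', 'open']  # ImageFolder sorts class folders alphabetically

model = GarageDoorCNN(resize_dim=64)
model.load_state_dict(torch.load('garage_door_cnn.pth', map_location='cpu'))
model.eval()

tf = transforms.Compose([
    CropLowerRightTriangle(triangle_width=556, triangle_height=1184),
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
    # Same ImageNet mean/std as sort.py; must match whatever training used
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

with torch.no_grad():
    x = tf(Image.open('sample.jpg').convert('RGB')).unsqueeze(0)
    probs = F.softmax(model(x), dim=1)[0]
for name, p in zip(CLASS_NAMES, probs):
    print(f"{name}: {p.item():.4f}")

A natural extension, not in this commit, would be to require confidence.item() above a threshold (say 0.9) before the shutil.copy, keeping borderline frames out of data/sorted/open/.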
train.py (69 lines changed)
@@ -6,75 +6,8 @@ from torchvision import datasets, transforms
 from PIL import Image, ImageDraw
 import os
 
-# Custom transform to crop a triangle from the lower right corner
-class CropLowerRightTriangle(object):
-    """
-    Crops a rectangular area from the lower right corner of an image,
-    then masks it to a triangle.
-    The user can adjust the geometry of the triangle.
-    """
-    def __init__(self, triangle_width, triangle_height):
-        self.triangle_width = triangle_width
-        self.triangle_height = triangle_height
+from model import CropLowerRightTriangle, GarageDoorCNN
 
-    def __call__(self, img):
-        img_width, img_height = img.size
-
-        # Define the bounding box for the crop
-        left = img_width - self.triangle_width
-        top = img_height - self.triangle_height
-        right = img_width
-        bottom = img_height
-
-        # Crop a rectangle from the lower right corner
-        cropped_img = img.crop((left, top, right, bottom))
-
-        # Create a triangular mask. The mask is the same size as the cropped rectangle.
-        mask = Image.new('L', (self.triangle_width, self.triangle_height), 0)
-        # The polygon vertices define the lower-right triangle within the rectangle.
-        # Vertices are (top-right, bottom-left, bottom-right).
-        polygon = [(self.triangle_width, 0), (0, self.triangle_height), (self.triangle_width, self.triangle_height)]
-        ImageDraw.Draw(mask).polygon(polygon, fill=255)
-
-        # Create a black background image.
-        background = Image.new("RGB", cropped_img.size, (0, 0, 0))
-
-        # Paste the original cropped image onto the background using the mask.
-        # Where the mask is white, the image is pasted. Where black, it's not.
-        background.paste(cropped_img, (0, 0), mask)
-
-        return background
-
-# Define the CNN
-class GarageDoorCNN(nn.Module):
-    def __init__(self, resize_dim=64):
-        super(GarageDoorCNN, self).__init__()
-        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
-        self.relu1 = nn.ReLU()
-        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
-        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
-        self.relu2 = nn.ReLU()
-        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
-        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
-        self.relu3 = nn.ReLU()
-        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)
-
-        # Calculate the size of the flattened features after convolutions and pooling
-        final_dim = resize_dim // (2**3)  # 3 pooling layers with stride 2
-        self.fc1_input_features = 64 * final_dim * final_dim
-
-        self.fc1 = nn.Linear(self.fc1_input_features, 512)
-        self.relu4 = nn.ReLU()
-        self.fc2 = nn.Linear(512, 2)  # 2 classes: open, closed
-
-    def forward(self, x):
-        x = self.pool1(self.relu1(self.conv1(x)))
-        x = self.pool2(self.relu2(self.conv2(x)))
-        x = self.pool3(self.relu3(self.conv3(x)))
-        x = x.view(-1, self.fc1_input_features)  # Flatten the tensor
-        x = self.relu4(self.fc1(x))
-        x = self.fc2(x)
-        return x
 
 def train_model():
     # --- Hyperparameters and Configuration ---