Skip to content

Database (PostgreSQL)

DealAI.lt uses PostgreSQL 13+ as its primary relational database management system. PostgreSQL was chosen for its superior performance with complex queries, excellent JSON support, and robust data integrity features.

Server Details:

  • Host: 162.55.174.116
  • Port: 5432
  • Database: dealai_products
  • Version: PostgreSQL 13+

Purpose: Main product catalog storing all aggregated product data

Schema:

CREATE TABLE product (
id SERIAL PRIMARY KEY,
external_id VARCHAR(255) UNIQUE,
title TEXT NOT NULL,
brand VARCHAR(255),
price NUMERIC(10, 2),
list_price NUMERIC(10, 2),
discount_percentage NUMERIC(5, 2),
availability VARCHAR(50),
description TEXT,
specifications JSONB,
image_urls JSONB,
category_id INTEGER,
site_id INTEGER,
product_url TEXT,
sku VARCHAR(255),
ean VARCHAR(50),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
last_crawled_at TIMESTAMP,
added_search BOOLEAN DEFAULT FALSE,
is_active BOOLEAN DEFAULT TRUE
);

Key Columns:

  • external_id - Unique identifier from source site
  • title - Product name/title
  • brand - Product brand/manufacturer
  • price - Current selling price
  • list_price - Original/list price
  • specifications - JSONB column for flexible attributes
  • image_urls - Array of image URLs in JSON
  • added_search - Flag for Elasticsearch sync status
  • last_crawled_at - Last scraping timestamp

Indexes:

CREATE INDEX idx_product_brand ON product(brand);
CREATE INDEX idx_product_category ON product(category_id);
CREATE INDEX idx_product_site ON product(site_id);
CREATE INDEX idx_product_added_search ON product(added_search);
CREATE INDEX idx_product_updated ON product(updated_at DESC);
CREATE INDEX idx_product_specs ON product USING GIN(specifications);

Purpose: Hierarchical category structure for product classification

Schema:

CREATE TABLE core_category (
id SERIAL PRIMARY KEY,
name VARCHAR(255) NOT NULL,
slug VARCHAR(255) UNIQUE,
parent_id INTEGER REFERENCES core_category(id),
category_url TEXT,
site_id INTEGER,
product_count INTEGER DEFAULT 0,
level INTEGER DEFAULT 0,
path VARCHAR(500),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

Key Features:

  • Self-referencing for parent-child relationships
  • path for efficient tree traversal
  • level for depth tracking
  • product_count for denormalized counts

Indexes:

CREATE INDEX idx_category_parent ON core_category(parent_id);
CREATE INDEX idx_category_slug ON core_category(slug);
CREATE INDEX idx_category_path ON core_category(path);

Purpose: Time-series data for price and availability tracking

Schema:

CREATE TABLE product_crawl_history (
id BIGSERIAL PRIMARY KEY,
product_id INTEGER REFERENCES product(id) ON DELETE CASCADE,
title TEXT,
brand VARCHAR(255),
price NUMERIC(10, 2),
list_price NUMERIC(10, 2),
availability VARCHAR(50),
crawled_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
changes JSONB
);

Key Features:

  • Tracks every price/availability change
  • Historical trend analysis
  • Change detection via JSONB column

Indexes:

CREATE INDEX idx_history_product ON product_crawl_history(product_id);
CREATE INDEX idx_history_crawled ON product_crawl_history(crawled_at DESC);
CREATE INDEX idx_history_product_time ON product_crawl_history(product_id, crawled_at DESC);

Purpose: Screenshot metadata and storage

Schema:

CREATE TABLE product_screenshot (
id SERIAL PRIMARY KEY,
product_id INTEGER REFERENCES product(id) ON DELETE CASCADE,
screenshot_url TEXT NOT NULL,
thumbnail_url TEXT,
file_size INTEGER,
width INTEGER,
height INTEGER,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

Purpose: Scraping job queue and status tracking

Schema:

CREATE TABLE category_product_crawl (
id SERIAL PRIMARY KEY,
category_id INTEGER REFERENCES core_category(id),
category_url TEXT NOT NULL,
started BOOLEAN DEFAULT FALSE,
complete BOOLEAN DEFAULT FALSE,
products_found INTEGER DEFAULT 0,
job_id VARCHAR(255),
spider_name VARCHAR(100),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
started_at TIMESTAMP,
completed_at TIMESTAMP,
error_message TEXT
);

File: /inc/products-db.php

get_db_connection():

function get_db_connection() {
static $connection = null;
if ($connection === null) {
$connection = pg_connect(
"host=" . DB_HOST .
" port=" . DB_PORT .
" dbname=" . DB_NAME .
" user=" . DB_USER .
" password=" . DB_PASSWORD
);
if (!$connection) {
error_log("Database connection failed");
return false;
}
}
return $connection;
}

get_products_paginated():

function get_products_paginated($page = 1, $per_page = 50, $filters = []) {
$offset = ($page - 1) * $per_page;
$connection = get_db_connection();
$where = build_where_clause($filters);
$query = "
SELECT p.*, c.name as category_name
FROM product p
LEFT JOIN core_category c ON p.category_id = c.id
WHERE 1=1 $where
ORDER BY p.updated_at DESC
LIMIT $per_page OFFSET $offset
";
$result = pg_query($connection, $query);
return pg_fetch_all($result);
}

get_product_by_id():

function get_product_by_id($product_id) {
$connection = get_db_connection();
$query = "
SELECT p.*, c.name as category_name, c.path as category_path
FROM product p
LEFT JOIN core_category c ON p.category_id = c.id
WHERE p.id = $1
";
$result = pg_query_params($connection, $query, [$product_id]);
return pg_fetch_assoc($result);
}

get_category_tree():

function get_category_tree($parent_id = null) {
$connection = get_db_connection();
$query = "
WITH RECURSIVE category_tree AS (
SELECT id, name, parent_id, level, path, product_count
FROM core_category
WHERE parent_id IS NULL
UNION ALL
SELECT c.id, c.name, c.parent_id, c.level, c.path, c.product_count
FROM core_category c
INNER JOIN category_tree ct ON c.parent_id = ct.id
)
SELECT * FROM category_tree
ORDER BY path, level, name
";
$result = pg_query($connection, $query);
return pg_fetch_all($result);
}

Use Prepared Statements:

$result = pg_query_params(
$connection,
"SELECT * FROM product WHERE id = $1",
[$product_id]
);

Batch Processing:

function update_products_batch($products) {
$connection = get_db_connection();
pg_query($connection, "BEGIN");
foreach ($products as $product) {
$query = "UPDATE product SET title = $1, price = $2 WHERE id = $3";
pg_query_params($connection, $query, [
$product['title'],
$product['price'],
$product['id']
]);
}
pg_query($connection, "COMMIT");
}

B-Tree Indexes (default):

  • Primary keys
  • Foreign keys
  • Frequently filtered columns

GIN Indexes (JSONB):

CREATE INDEX idx_product_specifications ON product USING GIN(specifications);

Partial Indexes:

CREATE INDEX idx_active_products ON product(id) WHERE is_active = TRUE;
ALTER TABLE product
ADD CONSTRAINT fk_product_category
FOREIGN KEY (category_id) REFERENCES core_category(id) ON DELETE SET NULL;

Update Timestamp Trigger:

CREATE OR REPLACE FUNCTION update_updated_at()
RETURNS TRIGGER AS $$
BEGIN
NEW.updated_at = CURRENT_TIMESTAMP;
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
CREATE TRIGGER product_updated_at
BEFORE UPDATE ON product
FOR EACH ROW
EXECUTE FUNCTION update_updated_at();

Daily Full Backup:

Terminal window
pg_dump -h 162.55.174.116 -U dealai_user dealai_products > backup_$(date +%Y%m%d).sql

Continuous Archiving:

Terminal window
# Configure WAL archiving in postgresql.conf
archive_mode = on
archive_command = 'cp %p /backup/archive/%f'
Terminal window
psql -h 162.55.174.116 -U dealai_user -d dealai_products < backup_20230915.sql
SELECT * FROM pg_stat_activity WHERE datname = 'dealai_products';
-- Enable query logging
ALTER DATABASE dealai_products SET log_min_duration_statement = 1000;
-- View slow queries
SELECT query, calls, total_time, mean_time
FROM pg_stat_statements
ORDER BY mean_time DESC
LIMIT 20;
SELECT schemaname, tablename,
pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS size
FROM pg_tables
WHERE schemaname = 'public'
ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC;
Terminal window
# Automated vacuum (configured in postgresql.conf)
autovacuum = on
# Manual vacuum
VACUUM ANALYZE product;
Terminal window
REINDEX TABLE product;
REINDEX DATABASE dealai_products;