Aggregation vs Joins: Methodology

I promised to outline my methodology for Oracle vs MySQL vs SQL Server: Aggregation vs Joins so here it is.

Oracle

Version: Oracle 10g Express Edition ("XE") running on Windows XP SP3

CREATE TABLE Emp (
  ID NUMBER(19,0) PRIMARY KEY,
  PersonID NUMBER(19,0),
  CompanyID NUMBER(19,0)
);
CREATE SEQUENCE emp_seq START WITH 1;
CREATE INDEX idx1 ON Emp (PersonID, CompanyID);
CREATE INDEX idx2 ON Emp (CompanyID, PersonID);

MySQL

Version: MySQL 5.0.49a (MyISAM) running on Windows XP SP3

CREATE TABLE Emp (
  ID INT(19) AUTO_INCREMENT PRIMARY KEY,
  PersonID INT(19),
  CompanyID INT(19)
);
CREATE INDEX idx1 ON Emp (PersonID, CompanyID);
CREATE INDEX idx2 ON Emp (CompanyID, PersonID);

SQL Server

Version: SQL Server running on Windows XP SP3

CREATE TABLE Emp (
  ID NUMERIC(19,0) IDENTITY PRIMARY KEY,
  PersonID NUMERIC(19,0),
  CompanyID NUMERIC(19,0)
);
CREATE INDEX idx1 ON Emp (PersonID, CompanyID);
CREATE INDEX idx2 ON Emp (CompanyID, PersonID);

Join Query

SELECT e1.personid
FROM emp e1
JOIN emp e2 ON e1.personid = e2.personid AND e2.companyid = 80
JOIN emp e3 ON e2.personid = e3.personid AND e3.companyid = 95
JOIN emp e4 ON e3.personid = e4.personid AND e4.companyid = 98
WHERE e1.companyid = 99

Aggregation Query

SELECT personid
FROM emp
WHERE companyid IN (80,95,98,99)
GROUP BY personid
HAVING COUNT(1) = 4

Emp.java

package com.cforcoding;

public class Emp {
    private Long id;
    private long personId;
    private long companyId;

    public Emp(long personId, long companyId) { this.personId = personId; this.companyId = companyId; }
    public Long getId() { return id; }
    public void setId(Long id) { this.id = id; } 
    public long getPersonId() { return personId; }
    public void setPersonId(long personId) { this.personId = personId; }
    public long getCompanyId() { return companyId; }
    public void setCompanyId(long companyId) { this.companyId = companyId; }
}

Emp.xml

Note: Ibatis was used to create the sample data. See Spring and Ibatis Tutorial for detailed setup instructions.

<insert id="insertEmp_sequence" parameterClass="com.cforcoding.Emp">
    <selectKey keyProperty="id" resultClass="long">
        SELECT emp_seq.NEXTVAL FROM DUAL
    </selectKey>
    INSERT INTO emp (id, person_id, company_id) VALUES (#id#, #personId#, #companyId#)
</insert>

<insert id="insertEmp_auto" parameterClass="com.cforcoding.Emp">
    INSERT INTO Emp (PersonID, CompanyID) VALUES (#personId#, #companyId#)
    <selectKey keyProperty="id" resultClass="long">
        SELECT @@IDENTITY
    </selectKey>
</insert>

The first query is used for Oracle. The second one for SQL Server and MySQL. The obvious DAO class has been skipped.

Create.java

private final static int BATCH_SIZE = 1000;
private final static List<Integer> companies = new ArrayList<Integer>();
private final static Random r = new Random(167234987609003358L);
private static PartyDAO partyDAO = null;

private static void runTest() {
    seedCompanies(100);
    long start = System.currentTimeMillis();
    for (int i=0; i<100; i++) {
        createEmpBatch(i * BATCH_SIZE);
        long now = System.currentTimeMillis();
        double duration = (now - start) / 1000.0D;
        System.out.printf("[+%,3f] Iteration %d%n", duration, i);
    }
}

private static void seedCompanies(int count) {
    double factor = 0.6d;
    double n = 3.0d;
    for (int i=0; i<count; i++) {
        int weight = (int) n;
        for (int j = 0; j < weight; j++) {
            companies.add(i);
        }
        n *= (1.0d + factor);
        if (factor > 0.01d) {
            factor *= 0.94d;
        }
    }
}

@Transactional
private static void createEmpBatch(int start) {
    for (int i=0; i<BATCH_SIZE; i++) {
        long person = start + i;
        int count = r.nextInt(3) + r.nextInt(3) + r.nextInt(5);
        Set<Long> employers = new HashSet<Long>();
        for (int j=0; j<count; j++) {
            long company;
            do {
                company = companies.get(r.nextInt(companies.size()));
            } while (employers.contains(company));
            employers.add(company);
            Emp e = new Emp(person, company);
            partyDAO.createEmp(e);
        }
    }
}

Basically what this does is:

  • Create a list of 100 companies;
  • Creates a weighted random table for the companies. The higher the company number, the more likely it is to appear. This is why the numbers in the queries above are also quite high
  • This generates roughly four million records; and
  • With the same random seed you can create the same dataset in each database.

If you see any issues, please let me know.

0 comments:

Post a Comment