| #include "Matrix.h"
#include <thread>
#include <vector>
// Out-of-class definition of the static mutex member (one object per element
// type T). NOTE(review): no definition visible in this file locks it —
// confirm it is still needed before relying on or removing it.
template <class T>
std::mutex Matrix<T>::index_mutex{};
// Out-of-class definition of the static next_row member, initialised to 0 for
// every element type T. NOTE(review): no definition visible in this file
// reads or writes it — confirm it is still needed.
template <class T>
int Matrix<T>::next_row{0};
// Builds an n-by-m matrix: n row vectors, each holding m value-initialized
// elements of type T.
//   n - number of rows
//   m - number of columns
template <class T>
Matrix<T>::Matrix(int n, int m)
    : rows(n),
      cols(m),
      data(n, std::vector<T>(m)) {
}
// Returns the stored row count.
template <class T>
int Matrix<T>::getRows() const {
    return this->rows;
}
// Returns the stored column count.
template <class T>
int Matrix<T>::getCols() const {
    return this->cols;
}
// Returns the element storage by value: the caller receives its own copy of
// the vector of row vectors, so editing it leaves this matrix untouched.
template <class T>
std::vector<std::vector<T>> Matrix<T>::getData() const {
    return this->data;
}
// Replaces the matrix contents with a copy of `value`.
//
// The row count becomes value.size() and the column count becomes the length
// of the first row (0 when `value` is empty).
//
// Throws std::invalid_argument when the rows of `value` do not all have the
// same length. The column count is taken from row 0 and the other members
// index every row up to that count, so a shorter row would otherwise be read
// out of bounds. Validation happens before any member is modified, so a
// rejected call leaves the matrix unchanged.
template<typename T>
void Matrix<T>::initialize(const std::vector<std::vector<T>>& value) {
    const std::size_t newCols = value.empty() ? 0 : value.front().size();
    for (const auto& row : value) {
        if (row.size() != newCols) {
            throw std::invalid_argument("All rows must have the same number of columns");
        }
    }
    // Dimensions are handled as int elsewhere in this file (constructor,
    // getRows/getCols), so make the narrowing from size_t explicit.
    rows = static_cast<int>(value.size());
    cols = static_cast<int>(newCols);
    data = value;
}
// Writes the matrix to std::cout, one bracketed row per line.
// Elements are separated by ','; the last element of a row is followed by a
// single space before the closing ']'. Every row ends with std::endl, so the
// stream is flushed after each line.
template <class T>
void Matrix<T>::print() const {
    const auto lastCol = cols - 1;
    for (int r = 0; r < rows; ++r) {
        std::cout << '[';
        for (int c = 0; c < cols; ++c) {
            std::cout << data[r][c];
            if (c == lastCol) {
                std::cout << ' ';
            } else {
                std::cout << ',';
            }
        }
        std::cout << ']' << std::endl;
    }
}
// Worker routine with a void*(void*) signature. `arg` is cast to ThreadData*
// and describes one band of rows of a matrix product.
//
// For each row in [start_row, end_row) and each column of `other`, the
// products self[row][k] * other[k][col] are added onto result[row][col].
// Values are accumulated with +=, so whatever `result` already holds is kept —
// NOTE(review): presumably callers pass a zero-filled result; confirm.
//
// Always returns nullptr.
template <class T>
void* Matrix<T>::threadFunction(void* arg) {
    ThreadData* job = static_cast<ThreadData*>(arg);
    const auto lastRow = job->end_row;
    const auto outCols = job->other->cols;
    const auto innerDim = job->self->cols;
    for (int r = job->start_row; r < lastRow; ++r) {
        for (int c = 0; c < outCols; ++c) {
            for (int k = 0; k < innerDim; ++k) {
                job->result->data[r][c] += job->self->data[r][k] * job->other->data[k][c];
            }
        }
    }
    return nullptr;
}
// Returns the element-wise sum of *this and `other`.
//
// Matrices with fewer than 10000 elements are summed on the calling thread.
// Larger ones are split into contiguous bands of rows, one worker thread per
// band, with the last band also taking the remainder rows. Each worker writes
// only its own rows of the result, so the workers need no locking.
//
// Throws std::invalid_argument when the two matrices differ in row or column
// count. If a worker thread cannot be started, the workers already running are
// joined and the exception from std::thread is rethrown.
template<typename T>
Matrix<T> Matrix<T>::add(const Matrix<T>& other) const {
    if (this->rows != other.getRows() || this->cols != other.getCols()) {
        throw std::invalid_argument("Matrices dimensions must be equal for addition");
    }
    Matrix<T> result(this->rows, this->cols);

    // Widen once to size_t: this removes the signed/unsigned comparisons in the
    // loops and keeps rows * cols from overflowing int on very large matrices.
    const size_t rowCount = static_cast<size_t>(this->rows);
    const size_t colCount = static_cast<size_t>(this->cols);

    // Sums rows [startRow, endRow) of both operands into `result`.
    const auto addRows = [&](size_t startRow, size_t endRow) {
        for (size_t i = startRow; i < endRow; ++i) {
            for (size_t j = 0; j < colCount; ++j) {
                result.data[i][j] = this->data[i][j] + other.data[i][j];
            }
        }
    };

    const size_t SmallMatrixThreshold = 10000;  // Threshold for small matrices
    const size_t totalElements = rowCount * colCount;
    if (totalElements < SmallMatrixThreshold) {
        // Single-threaded addition
        addRows(0, rowCount);
        return result;
    }

    // Multi-threaded addition with a fixed worker count.
    const size_t hardwareThreads = 2;
    // NOTE(review): diagnostic output on stdout from an arithmetic routine;
    // kept byte-for-byte for existing callers, consider removing it.
    std::cout << std::endl << "threads: " << hardwareThreads << " count" << std::endl;

    const size_t rowsPerThread = rowCount / hardwareThreads;
    std::vector<std::thread> threads;
    threads.reserve(hardwareThreads);
    try {
        for (size_t t = 0; t < hardwareThreads; ++t) {
            const size_t startRow = t * rowsPerThread;
            // The last worker also covers the rows left over by the division.
            const size_t endRow = (t == hardwareThreads - 1) ? rowCount : startRow + rowsPerThread;
            threads.emplace_back(addRows, startRow, endRow);
        }
    } catch (...) {
        // Destroying a joinable std::thread calls std::terminate, and the
        // workers reference locals of this frame: join them before unwinding.
        for (auto& worker : threads) {
            worker.join();
        }
        throw;
    }
    for (auto& worker : threads) {
        worker.join();
    }
    return result;
}
// Returns the matrix product (*this) x other.
//
// The rows * other.cols output cells are numbered in row-major order and
// handed out in fixed-size chunks through an atomic counter. Every worker
// thread keeps claiming the next chunk until none are left. Each cell is
// written by exactly one worker, so the result needs no locking.
//
// Throws std::invalid_argument when this->cols differs from other's row
// count. If a worker thread cannot be started, the remaining work is
// cancelled, the workers already running are joined, and the exception from
// std::thread is rethrown.
template<typename T>
Matrix<T> Matrix<T>::dot(const Matrix<T>& other) const {
    if (this->cols != other.getRows()) {
        throw std::invalid_argument("Matrix dimensions do not allow multiplication");
    }
    const int outCols = other.getCols();
    Matrix<T> result(this->rows, outCols);

    const int totalWorkload = this->rows * outCols;
    // hardware_concurrency() may return 0 (unknown) or 1, and halving either
    // gives 0. Clamp to one worker so the division below cannot divide by zero.
    const int hardwareThreads =
        std::max(1, static_cast<int>(std::thread::hardware_concurrency() / 2));
    const int workloadPerThread = std::max(1, totalWorkload / hardwareThreads);
    // An empty product yields threadCount == 0: no workers, empty result.
    const int threadCount =
        std::min(hardwareThreads, (totalWorkload + workloadPerThread - 1) / workloadPerThread);

    std::atomic<int> next_workload(0);

    // Claims chunks of output cells until the counter passes the last cell.
    const auto worker = [&]() {
        while (true) {
            const int start_index = next_workload.fetch_add(workloadPerThread);
            if (start_index >= totalWorkload) break;
            const int end_index = std::min(start_index + workloadPerThread, totalWorkload);
            for (int index = start_index; index < end_index; ++index) {
                // Map the flat cell index back to (row, col) of the result.
                const int row = index / outCols;
                const int col = index % outCols;
                T sum = 0;
                for (int k = 0; k < this->cols; ++k) {
                    sum += this->data[row][k] * other.data[k][col];
                }
                result.data[row][col] = sum;
            }
        }
    };

    std::vector<std::thread> threads;
    threads.reserve(threadCount);
    try {
        for (int t = 0; t < threadCount; ++t) {
            threads.emplace_back(worker);
        }
    } catch (...) {
        // Destroying a joinable std::thread calls std::terminate, and the
        // workers reference locals of this frame. Mark the work as exhausted
        // so they stop early, then join them before unwinding.
        next_workload.store(totalWorkload);
        for (auto& t : threads) {
            t.join();
        }
        throw;
    }
    for (auto& t : threads) {
        t.join();
    }
    return result;
}
// Operator form of the matrix product: forwards to dot(other) and returns its
// result, so any exception raised by dot() propagates unchanged.
template <class T>
Matrix<T> Matrix<T>::operator*(const Matrix<T>& other) const {
    return this->dot(other);
}
// Explicit instantiations for int, float and double.
// The member definitions above live in this source file, so these statements
// make the compiler emit them here for each listed element type.
template class Matrix<int>;
template class Matrix<float>;
template class Matrix<double>;
 |